diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9fa9cccd3e3ed..aa10cdda36ec2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2773,6 +2773,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_strict_wwm: Opcode = AMDGPU::STRICT_WWM; + CurDAG->getMachineFunction() + .getInfo<SIMachineFunctionInfo>() + ->setInitWholeWave(); break; case Intrinsic::amdgcn_strict_wqm: Opcode = AMDGPU::STRICT_WQM; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 926c1e4b23b4a..96d0c321704f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1055,8 +1055,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_softwqm: return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); case Intrinsic::amdgcn_strict_wwm: - case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_wwm: { + MachineFunction *MF = I.getParent()->getParent(); + SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); + MFInfo->setInitWholeWave(); return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM); + } case Intrinsic::amdgcn_strict_wqm: return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM); case Intrinsic::amdgcn_writelane: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8fc32d9e60bf2..498080caf6962 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -184,7 +184,11 @@ static bool resultDependsOnExec(const MachineInstr &MI) { bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { // Any implicit use of exec by VALU is not a real register read. 
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && - isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); + isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()) && + !MO.getParent() + ->getMF() + ->getInfo<SIMachineFunctionInfo>() + ->hasInitWholeWave(); } bool SIInstrInfo::isSafeToSink(MachineInstr &MI, diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9577230c6c52e..b5a8d72af5c3f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2825,44 +2825,44 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; 
GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -2908,44 +2908,44 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: 
v_cndmask_b32_e64 v2, 0, 0, s[4:5] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -2984,76 +2984,76 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, 
v5, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v4, v5, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v1, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v4, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1064_DPP-NEXT: 
v_writelane_b32 v7, s11, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s10, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 @@ -3076,8 +3076,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1064_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 @@ -3089,70 +3089,70 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; 
GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, 
v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s4 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[8:9], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv @@ -3160,177 +3160,182 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v10, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1032_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1164_DPP-NEXT: 
v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s10, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[6:7], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[7:8], off, s[4:7], 0 glc ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB5_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s2, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s3, v10, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, 
vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, 
v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; 
GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v7, s8, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -3351,8 +3356,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 @@ -3364,165 +3369,170 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: add_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: 
v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; 
GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; 
GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1264_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v2, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v1, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1264_DPP-NEXT: v_writelane_b32 v4, s10, 48 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s11, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1264_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1264_DPP-NEXT: ; %bb.1: -; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v7, s5 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[7:8], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB5_2: ; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 -; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v7 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 +; GFX1264_DPP-NEXT: v_add_co_u32 v7, vcc, s2, v9 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s3, v10, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null +; GFX1264_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], null ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: add_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1232_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 
v1, v5, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v1, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 
s6, -1 @@ -3544,8 +3554,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 @@ -6419,44 +6429,44 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, 
v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v2, v4, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -6502,44 +6512,44 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: 
v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -6578,76 +6588,76 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 
row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 
v3, vcc, v4, v5, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v1, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 
v4, vcc, v4, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s11, 48 -; GFX1064_DPP-NEXT: 
v_writelane_b32 v6, s10, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 @@ -6670,8 +6680,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1064_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 @@ -6683,70 +6693,70 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, 
v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; 
GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s4 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[8:9], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv @@ -6754,177 +6764,182 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, 
v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1032_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, 
-1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: 
v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 
32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s10, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[6:7], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[7:8], off, s[4:7], 0 glc ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB11_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 +; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s2, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s3, v10, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp 
v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v7, s8, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -6945,8 +6960,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 @@ -6958,165 +6973,170 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: sub_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v2, 31 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 31 -; GFX1264_DPP-NEXT: v_writelane_b32 
v4, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v2, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v1, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1264_DPP-NEXT: v_writelane_b32 v4, s10, 48 -; GFX1264_DPP-NEXT: v_writelane_b32 v5, s11, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1264_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1264_DPP-NEXT: ; %bb.1: -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: 
v_mov_b32_e32 v8, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[7:8], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB11_2: ; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 -; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v7 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 +; GFX1264_DPP-NEXT: v_sub_co_u32 v7, vcc, s2, v9 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc +; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s3, v10, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null +; GFX1264_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], null ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: sub_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: 
v_cndmask_b32_e64 v1, 0, 0, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, 
v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 
v1, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v1, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 -; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -7138,8 +7158,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) 
%out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 45b161d7959f4..29e3e6d269933 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -759,42 +759,44 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: add_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 
; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 @@ -804,41 +806,43 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; 
GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 -; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 @@ -872,7 +876,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -882,9 +885,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1064_DPP-NEXT: s_mov_b32 
s3, s6 -; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB2_2: @@ -915,7 +919,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -926,8 +929,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB2_2: @@ -975,9 +979,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -987,9 +990,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164_DPP-NEXT: ; 
%bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB2_2: @@ -1026,7 +1030,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -1037,8 +1040,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB2_2: @@ -1278,9 +1282,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX8_DPP-LABEL: add_i32_varying_nouse: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 @@ -1298,22 +1301,22 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; 
GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: s_mov_b32 s0, s2 -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s0 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_u32 v2, v0 +; GFX8_DPP-NEXT: ds_add_u32 v0, v2 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB3_2: ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: add_i32_varying_nouse: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 @@ -1331,12 +1334,13 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: s_mov_b32 s0, s2 -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX9_DPP-NEXT: ds_add_u32 v2, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s0 +; GFX9_DPP-NEXT: ds_add_u32 v0, v2 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB3_2: ; GFX9_DPP-NEXT: s_endpgm @@ -1357,13 +1361,13 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_DPP-NEXT: ds_add_u32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1382,14 +1386,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: ds_add_u32 v3, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB3_2: @@ -1418,16 +1422,16 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: ds_add_u32 v3, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB3_2: @@ -1450,14 +1454,15 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: ds_add_u32 v3, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB3_2: @@ -2326,420 +2331,426 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX8_DPP-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 
row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX8_DPP-NEXT: 
v_readlane_b32 s2, v2, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_add_rtn_u64 v[6:7], v0, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX8_DPP-NEXT: v_add_u32_e32 v5, vcc, s5, v5 +; GFX8_DPP-NEXT: v_add_u32_e32 v6, vcc, s5, v6 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v6, vcc, v0, v6, vcc +; GFX8_DPP-NEXT: v_addc_u32_e32 v7, vcc, v0, v7, vcc 
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, 
v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 +; GFX9_DPP-NEXT: ds_add_rtn_u64 v[6:7], v0, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v5, vcc, s5, v5 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v6, vcc, s5, v6 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v6, vcc, v0, v6, vcc +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v7, vcc, v0, v7, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, 
v5, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v4, v5, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v1, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v4, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 
s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s3, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_add_co_u32 v8, vcc, s3, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s4, v12, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s4, v11, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: 
buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 
v6, v2, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: 
v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s3, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 
v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, 
vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; 
GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 
0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 ; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2748,8 +2759,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9 @@ -2761,64 +2772,68 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 
v0, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: 
v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s1 +; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v8, s0 ; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv @@ -2826,8 +2841,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10 @@ -3093,50 +3108,51 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX8_DPP-LABEL: add_i64_varying_nouse: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 
v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 
row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 @@ -3144,59 +3160,61 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s0 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_u64 v5, v[6:7] +; GFX8_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB7_2: ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: add_i64_varying_nouse: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 ; 
GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: 
v_readlane_b32 s3, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 @@ -3204,8 +3222,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s0 -; GFX9_DPP-NEXT: ds_add_u64 v5, v[6:7] +; GFX9_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB7_2: ; GFX9_DPP-NEXT: s_endpgm @@ -3215,30 +3234,30 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v4, v5, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v7 +; 
GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v6, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 @@ -3251,17 +3270,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 ; GFX1064_DPP-NEXT: 
v_readlane_b32 s5, v2, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 ; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_add_u64 v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1064_DPP-NEXT: ds_add_u64 v0, v[8:9] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB7_2: @@ -3272,44 +3291,44 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; 
GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v6, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: ds_add_u64 v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: ds_add_u64 v0, v[8:9] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB7_2: @@ -3317,41 +3336,43 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1164_DPP-LABEL: add_i64_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v2, v1, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3360,16 +3381,16 @@ define amdgpu_kernel void 
@add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v7 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: ds_add_u64 v0, v[5:6] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB7_2: @@ -3377,50 +3398,53 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1132_DPP-LABEL: add_i64_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: 
v_permlanex16_b32 v3, v2, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v7 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: ds_add_u64 v0, v[5:6] +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB7_2: @@ -4181,42 +4205,44 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: sub_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB10_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 @@ -4226,41 +4252,43 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: sub_i32_varying: ; 
GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 
wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 -; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB10_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 @@ -4294,7 +4322,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -4304,9 +4331,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB10_2: 
@@ -4337,7 +4365,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -4348,8 +4375,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB10_2: @@ -4397,9 +4425,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -4409,9 +4436,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1164_DPP-NEXT: 
ds_sub_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB10_2: @@ -4448,7 +4476,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -4459,8 +4486,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB10_2: @@ -4700,9 +4728,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX8_DPP-LABEL: sub_i32_varying_nouse: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 @@ -4720,22 +4747,22 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: s_mov_b32 s0, s2 -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8_DPP-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s0 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX8_DPP-NEXT: ds_sub_u32 v0, v2 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB11_2: ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: sub_i32_varying_nouse: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 @@ -4753,12 +4780,13 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: s_mov_b32 s0, s2 -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX9_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s0 +; GFX9_DPP-NEXT: ds_sub_u32 v0, v2 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB11_2: ; GFX9_DPP-NEXT: s_endpgm @@ -4779,13 +4807,13 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4804,14 +4832,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: ds_sub_u32 v3, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB11_2: @@ -4840,16 +4868,16 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: ds_sub_u32 v3, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB11_2: @@ -4872,14 +4900,15 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: ds_sub_u32 v3, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB11_2: @@ -5775,420 +5804,426 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: 
v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, 
v2, v4, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[6:7], v0, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX8_DPP-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 +; GFX8_DPP-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 -; GFX8_DPP-NEXT: v_subb_u32_e32 v6, vcc, v0, v6, vcc +; GFX8_DPP-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: 
sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 
v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 +; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[6:7], v0, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB14_2: ; 
GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: v_sub_co_u32_e32 v5, vcc, s5, v5 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v6, vcc, s5, v6 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_subb_co_u32_e32 v6, vcc, v0, v6, vcc +; GFX9_DPP-NEXT: v_subb_co_u32_e32 v7, vcc, v0, v7, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; 
GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v4, v5, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v1, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: 
v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v4, vcc, v4, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, 
exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s3, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_sub_co_u32 v8, vcc, s3, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s4, v12, vcc +; 
GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s4, v11, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 -; 
GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf 
bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v1, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: 
$vgpr8_vgpr9 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s3, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 
0 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v1, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; 
GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 ; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6197,8 +6232,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9 @@ -6210,64 +6245,68 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 
row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 
v7, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v7, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, s0 ; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv @@ -6275,8 +6314,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10 @@ -7251,7 +7290,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: and_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] @@ -7259,7 +7297,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7285,9 +7323,10 @@ define amdgpu_kernel void 
@and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v0, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -7307,7 +7346,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: and_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] @@ -7315,7 +7353,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7341,8 +7379,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v0, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -7364,8 +7403,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; 
GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7378,62 +7415,64 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s3 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; 
GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[6:7], v0, v[6:7] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 
exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s3, v9 -; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i64_varying: @@ -7441,8 +7480,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7452,47 +7490,48 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1032_DPP-NEXT: 
v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v4 ; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[6:7], v0, v[6:7] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s3, v9 -; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i64_varying: @@ -7502,9 +7541,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -7522,66 +7559,68 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1164_DPP-NEXT: ds_and_rtn_b64 
v[6:7], v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB16_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s3, v9 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i64_varying: @@ -7591,7 +7630,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7606,48 +7645,49 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; 
GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, s1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[6:7], v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB16_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s3, v9 -; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -7966,42 +8006,44 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: or_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX8_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB17_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_or_b32_e32 v0, s4, v0 @@ -8011,41 +8053,43 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: or_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 -; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB17_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_or_b32_e32 v0, s4, v0 @@ -8079,7 +8123,6 @@ define 
amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8089,9 +8132,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB17_2: @@ -8122,7 +8166,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8133,8 +8176,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB17_2: @@ -8182,9 +8226,8 @@ define amdgpu_kernel void 
@or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8194,9 +8237,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB17_2: @@ -8233,7 +8277,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8244,8 +8287,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, 
v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB17_2: @@ -8614,34 +8658,36 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: or_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -8649,17 +8695,18 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v0, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8670,34 +8717,36 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: or_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -8705,16 +8754,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; 
GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v0, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8728,8 +8778,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8742,62 +8790,64 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s3 
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1064_DPP-NEXT: 
v_writelane_b32 v3, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[6:7], v0, v[6:7] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s3, v9 -; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s4, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i64_varying: @@ -8805,8 +8855,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; 
GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8816,47 +8865,48 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v4 ; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, 
-1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[6:7], v0, v[6:7] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s3, v9 -; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s4, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; 
; GFX1164_DPP-LABEL: or_i64_varying: @@ -8866,9 +8916,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -8886,66 +8934,68 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, 
v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; 
GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[6:7], v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB18_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s3, v9 -; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s4, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; 
GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: @@ -8955,7 +9005,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8970,48 +9020,49 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, s1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[6:7], v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB18_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s3, v9 -; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s4, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -9330,42 +9381,44 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: xor_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: 
v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB19_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 @@ -9375,41 +9428,43 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: xor_i32_varying: ; 
GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 
wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 -; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB19_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 @@ -9443,7 +9498,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -9453,9 +9507,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB19_2: 
@@ -9486,7 +9541,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -9497,8 +9551,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB19_2: @@ -9546,9 +9601,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -9558,9 +9612,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1164_DPP-NEXT: 
ds_xor_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB19_2: @@ -9597,7 +9652,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -9608,8 +9662,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB19_2: @@ -9978,34 +10033,36 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: xor_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 
0 ; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: 
v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -10013,17 +10070,18 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v0, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB20_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 @@ -10034,34 +10092,36 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: xor_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa 
bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -10069,16 +10129,17 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v0, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB20_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 @@ -10092,8 +10153,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10106,62 +10165,64 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s3 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: 
v_readlane_b32 s6, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[6:7], v0, v[6:7] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i64_varying: @@ -10169,8 +10230,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10180,47 +10240,48 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, 
v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v4 ; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, s0 +; 
GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[6:7], v0, v[6:7] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i64_varying: @@ -10230,9 +10291,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) @@ -10250,66 +10309,68 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 
v7, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[6:7], v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB20_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 -; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i64_varying: @@ -10319,7 +10380,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10334,48 +10395,49 @@ define amdgpu_kernel void 
@xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v4, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1132_DPP-NEXT: 
s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, s1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s0 +; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[6:7], v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB20_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 -; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -11629,55 +11691,57 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; 
%entry +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 
v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -11695,9 +11759,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] +; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB23_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -11719,55 +11784,57 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -11785,8 +11852,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 -; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] +; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB23_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -11809,92 +11877,92 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 
v2, 0x80000000, 0, s[0:1] -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: 
v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: 
s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1064_DPP-NEXT: 
v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB23_2: @@ -11902,84 +11970,84 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; 
GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: max_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: 
v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; 
GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB23_2: @@ -11987,228 +12055,227 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: 
v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: max_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s[0:1] -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; 
GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: 
v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; 
GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB23_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX1132_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; 
GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v10, s1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB23_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -13463,54 +13530,54 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: min_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; 
GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; 
GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -13528,9 +13595,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] +; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB26_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -13553,54 +13621,54 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: min_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 
v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 
; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -13618,8 +13686,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 -; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] +; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB26_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -13642,92 +13711,92 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: min_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1] -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 
-; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: 
v_writelane_b32 v4, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_i64 
v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB26_2: @@ -13735,84 +13804,84 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, 
vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1032_DPP-NEXT: ; 
%bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB26_2: @@ -13820,228 +13889,227 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: min_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1] -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; 
GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 
v5, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: 
v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 
v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB26_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: 
v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v5, -2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB26_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: 
v_readfirstlane_b32 s5, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -14360,42 +14428,44 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umax_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB27_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; 
GFX8_DPP-NEXT: v_max_u32_e32 v0, s4, v0 @@ -14405,41 +14475,43 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: umax_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; 
GFX9_DPP-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 -; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB27_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_max_u32_e32 v0, s4, v0 @@ -14473,7 +14545,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -14483,9 +14554,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 -; 
GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB27_2: @@ -14516,7 +14588,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -14527,8 +14598,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB27_2: @@ -14576,9 +14648,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -14588,9 +14659,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 ; 
GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 ; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 -; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB27_2: @@ -14627,7 +14699,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -14638,8 +14709,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB27_2: @@ -15285,274 +15357,278 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umax_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; 
GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: 
v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] +; GFX8_DPP-NEXT: ds_max_rtn_u64 v[6:7], v0, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] +; GFX8_DPP-NEXT: v_readfirstlane_b32 
s5, v7 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: umax_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 
v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 +; GFX9_DPP-NEXT: 
v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX9_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 +; GFX9_DPP-NEXT: ds_max_rtn_u64 v[6:7], v0, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: umax_i64_varying: ; GFX1064_DPP: ; %bb.0: ; 
%entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: 
v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 
v3, v5, v3, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB29_2: @@ -15560,84 +15636,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 
v[7:8], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: 
v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; 
GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; 
GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB29_2: @@ -15645,222 +15721,233 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: 
v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umax_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, 
v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 
15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; 
GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB29_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: 
v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 
row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: 
v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB29_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -17105,60 +17192,60 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: umin_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 
v0, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 ; GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -17170,9 +17257,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] +; GFX8_DPP-NEXT: ds_min_rtn_u64 v[6:7], v0, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -17195,60 +17283,60 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: umin_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf 
bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 ; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf @@ -17260,8 +17348,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 -; GFX9_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] +; GFX9_DPP-NEXT: ds_min_rtn_u64 v[6:7], v0, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] @@ -17284,92 +17373,92 @@ define 
amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: umin_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1064_DPP-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v2 
+; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 
s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB32_2: @@ -17377,84 +17466,84 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_DPP-NEXT: s_mov_b32 null, 0 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, 
s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; 
GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, 
v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, 
exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB32_2: @@ -17462,222 +17551,233 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_DPP-NEXT: s_mov_b32 null, 0 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, 
vcc_lo +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umin_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: 
v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, 
v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; 
GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; 
GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB32_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: 
s_endpgm ; ; GFX1132_DPP-LABEL: umin_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 
v8, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v1, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, -1 :: v_dual_cndmask_b32 v2, v4, v2 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 -; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: 
$vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB32_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 7aca63d34f51b..4eb33c70210d5 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -8,31 +8,33 @@ define i32 @test(i32 %val, i32 %cond) { ; 
GCN-NEXT: s_xor_saveexec_b32 s4, -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: s_or_saveexec_b32 s4, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 -; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: s_mov_b32 exec_lo, s4 +; GCN-NEXT: v_mov_b32_e32 v5, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5 -; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v0, s5 +; GCN-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v3 ; GCN-NEXT: ; %bb.2: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GCN-NEXT: v_add_nc_u32_e32 v0, v5, v2 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1 -; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_clause 0x2 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index fbe06b3651b06..1a2c45a07b287 
100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -817,23 +817,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -847,14 +847,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; 
GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -897,17 +898,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -924,15 +925,16 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -974,17 +976,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 
v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -996,14 +998,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -1035,44 +1038,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf 
bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB1_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1098,37 +1101,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; 
GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB1_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -2037,23 +2039,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp 
v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -2067,14 +2069,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2117,17 +2120,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf 
+; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -2144,15 +2147,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -2194,17 +2198,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2216,14 +2220,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -2255,44 +2260,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 
0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB3_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2318,37 +2323,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 
row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB3_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -3317,23 +3321,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: 
v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -3347,14 +3351,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -3397,17 
+3402,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -3424,15 +3429,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -3474,17 +3480,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3496,14 +3502,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -3535,44 +3542,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; 
GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3598,37 +3605,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, 
exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -4093,23 +4099,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -4123,14 +4129,15 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4173,17 +4180,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -4200,15 +4207,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -4250,17 +4258,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; 
GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -4272,14 +4280,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -4311,44 +4320,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB6_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4374,37 
+4383,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB6_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -5398,23 +5406,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -5428,14 +5436,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -5478,17 +5487,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: 
v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -5505,15 +5514,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; 
GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -5555,17 +5565,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5577,14 +5587,15 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -5616,53 +5627,54 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -5692,46 +5704,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -7389,54 +7401,57 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, 
div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 
; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -7446,18 +7461,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; 
GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -7471,27 +7486,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -7521,19 +7538,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -7545,24 +7564,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf 
bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] @@ -7573,49 +7592,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[43:44] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1064-DPP-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -7646,19 +7667,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; 
GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -7670,71 +7693,73 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 
row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; 
GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[43:44] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: 
buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -7779,30 +7804,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], 
v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -7816,16 +7842,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -7842,13 +7868,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7902,46 +7928,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], 
v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7957,10 +7982,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -9003,35 +9029,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; 
GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: 
v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -9041,19 +9068,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -9061,12 +9088,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: @@ -9106,24 +9134,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf 
bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -9134,27 +9162,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 @@ -9195,49 +9224,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 @@ -9268,30 +9298,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf 
bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -9302,31 +9333,33 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 
v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -9358,56 +9391,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, 
v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 
row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt 
vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -10436,35 +10469,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] 
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -10474,19 +10508,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: 
s_cbranch_execz .LBB14_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -10494,12 +10528,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 ; GFX9-DPP-NEXT: .LBB14_3: @@ -10539,24 +10574,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 
v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -10567,27 +10602,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 @@ -10628,49 +10664,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, 
v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; 
GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 @@ -10701,30 +10738,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -10735,31 +10773,33 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ 
-10791,56 +10831,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 
row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -11351,35 +11391,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa 
bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -11389,19 +11430,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -11409,12 +11450,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: 
v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 ; GFX9-DPP-NEXT: .LBB15_3: @@ -11454,24 +11496,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: 
v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -11482,27 +11524,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], 
v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 @@ -11543,49 +11586,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf 
bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 @@ -11616,30 +11660,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -11650,31 +11695,33 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: 
global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -11706,56 +11753,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp 
v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; 
GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: 
v_add_f64 v[6:7], v[8:9], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -13445,54 +13492,57 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -13502,18 +13552,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -13527,27 +13577,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: 
v_lshlrev_b32_e32 v1, 10, v41 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -13577,19 +13629,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; 
GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -13601,24 +13655,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: 
v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] @@ -13629,49 +13683,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[43:44] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; 
GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -13702,19 +13758,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -13726,71 +13784,73 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; 
GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[43:44] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 
s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -13835,30 +13895,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -13872,16 +13933,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -13898,13 +13959,13 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -13958,46 +14019,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -14013,10 +14073,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 7792422291998..debd2489c1922 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -719,28 +719,28 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; 
GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -756,17 +756,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -808,22 +809,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 
0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -881,22 +882,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -936,54 +937,54 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; 
GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; 
GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB1_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1007,45 +1008,46 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; 
GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB1_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1758,28 +1760,28 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: 
v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -1795,17 +1797,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -1847,22 +1850,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 
row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -1920,22 +1923,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -1975,54 +1978,54 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB3_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2046,45 +2049,46 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz 
.LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB3_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -2797,28 +2801,28 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; 
GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -2834,17 +2838,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -2886,22 +2891,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; 
GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -2959,22 +2964,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf 
bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 @@ -3014,54 +3019,54 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3085,45 +3090,46 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: 
v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: 
global_atomic_max_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_max_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -4766,59 +4772,62 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: 
v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 
row_bcast:15 row_mask:0xa bank_mask:0xf @@ -4830,18 +4839,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -4850,34 +4859,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_max_f64 v[2:3], v[5:6], v[3:4] +; GFX9-DPP-NEXT: 
v_lshlrev_b32_e32 v4, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v4, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -4907,19 +4918,21 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; 
GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -4931,29 +4944,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, 
v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -4967,21 +4980,21 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; 
GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 @@ -4989,29 +5002,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -5042,19 +5057,21 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; 
GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -5066,79 +5083,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 
row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1032-DPP-NEXT: v_mov_b32_e32 
v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -5183,37 +5202,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: 
v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -5229,18 +5249,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5248,8 +5267,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5258,19 +5278,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], 
off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5317,80 +5338,82 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, 0x7ff80000 :: v_dual_mov_b32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 
0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; 
GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6245,40 +6268,41 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf 
bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -6290,34 +6314,35 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: 
global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: @@ -6357,29 +6382,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; 
GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] @@ -6393,15 +6418,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] ; GFX1064-DPP-NEXT: .LBB9_2: @@ -6441,43 +6466,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 
v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; 
GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] ; GFX1032-DPP-NEXT: .LBB9_2: @@ -6507,37 +6532,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; 
GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -6551,34 +6577,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[12:13], v[8:9] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -6609,65 +6637,67 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, 
0x7ff80000 :: v_dual_mov_b32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 
0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; 
GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: 
v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[12:13], v[8:9] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -8314,59 +8344,62 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: 
s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -8378,18 +8411,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -8398,34 +8431,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_max_f64 
v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_max_f64 v[2:3], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v4, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; 
GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -8455,19 +8490,21 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -8479,29 +8516,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, 
v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], 
v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8515,21 +8552,21 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: 
s_add_u32 s8, s34, 44 @@ -8537,29 +8574,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -8590,19 +8629,21 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -8614,79 +8655,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; 
GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 
v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -8731,37 +8774,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -8777,18 +8821,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 
exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8796,8 +8839,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8806,19 +8850,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; 
GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8865,80 +8910,82 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, 0x7ff80000 :: v_dual_mov_b32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf 
bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], 
v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: 
v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index cb3291df891af..2cde74fe6bc88 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -719,28 +719,28 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: 
v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -756,17 +756,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -808,22 +809,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: 
v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -881,22 +882,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; 
GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -936,54 +937,54 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB1_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1007,45 +1008,46 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, 
v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: 
v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB1_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1758,28 +1760,28 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 
1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -1795,17 +1797,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -1847,22 +1850,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -1920,22 +1923,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: 
s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -1975,54 +1978,54 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB3_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2046,45 +2049,46 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 
v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB3_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -2797,28 +2801,28 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 @@ -2834,17 +2838,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -2886,22 +2891,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -2959,22 +2964,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v5, v6, v6 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 @@ -3014,54 +3019,54 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, 
v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX1164-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 
s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3085,45 +3090,46 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_max_f32 v3, v3, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_dual_max_f32 v3, v4, v4 :: v_dual_mov_b32 v4, v1 +; 
GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_min_f32 v5, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -4766,59 +4772,62 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; 
GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: 
v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -4830,18 +4839,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -4850,34 +4859,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_min_f64 v[2:3], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v4, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -4907,19 +4918,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: 
s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -4931,29 +4944,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], 
v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -4967,21 +4980,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 @@ -4989,29 +5002,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -5042,19 +5057,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; 
GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -5066,79 +5083,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 
v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], 
v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -5183,37 +5202,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf 
bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -5229,18 +5249,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5248,8 +5267,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5258,19 +5278,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: 
s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5317,80 +5338,82 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, 0x7ff80000 :: v_dual_mov_b32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; 
GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], 
v[3:4], v[41:42] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6245,40 +6268,41 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], 
v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -6290,34 +6314,35 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: @@ -6357,29 +6382,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 
row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v6, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] @@ -6393,15 +6418,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] ; GFX1064-DPP-NEXT: .LBB9_2: @@ -6441,43 +6466,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 
row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v6, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] ; GFX1032-DPP-NEXT: .LBB9_2: @@ -6507,37 +6532,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -6551,34 +6577,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[12:13], v[8:9] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -6609,65 +6637,67 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, 0x7ff80000 :: v_dual_mov_b32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf 
bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[12:13], v[8:9] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -8314,59 +8344,62 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: 
v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; 
GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, 0x7ff80000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -8378,18 +8411,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: 
s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -8398,34 +8431,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_min_f64 v[2:3], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v4, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -8455,19 +8490,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -8479,29 +8516,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; 
GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; 
GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8515,21 +8552,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], 
v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 @@ -8537,29 +8574,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -8590,19 +8629,21 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -8614,79 +8655,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf 
bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[43:44], v[43:44] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] 
; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -8731,37 +8774,38 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], 
v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -8777,18 +8821,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8796,8 +8839,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 
v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8806,19 +8850,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8865,80 +8910,82 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, 0x7ff80000 :: v_dual_mov_b32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, 0x7ff80000 :: v_dual_mov_b32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6dc3a1971a485..425719ca11ea4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -903,23 +903,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; 
GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -933,14 +933,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -983,17 +984,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: 
v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -1010,15 +1011,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -1060,17 +1062,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; 
GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -1082,14 +1084,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -1121,53 +1124,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf 
bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -1197,46 +1201,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: 
v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: 
v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -2235,23 +2239,23 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -2265,14 +2269,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2315,17 +2320,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -2342,15 +2347,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], 
s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -2392,17 +2398,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: 
v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2414,14 +2420,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -2453,53 +2460,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -2529,46 +2537,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -3567,23 +3575,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -3597,14 +3605,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -3647,17 +3656,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 
v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -3674,15 +3683,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -3724,17 +3734,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; 
GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3746,14 +3756,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -3785,53 +3796,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, 
s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: 
v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -3861,46 +3873,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: 
v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) 
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -4395,23 +4407,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: 
v_add_f32_e32 v4, v4, v5 @@ -4425,14 +4437,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4475,17 +4488,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf 
bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -4502,15 +4515,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -4552,17 +4566,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp 
v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -4574,14 +4588,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -4613,53 +4628,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; 
GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -4689,46 +4705,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -5726,23 +5742,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 
; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 @@ -5756,14 +5772,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] +; GFX9-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -5806,17 +5823,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1064-DPP-NEXT: 
v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -5833,15 +5850,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1064-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -5883,17 +5901,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; 
GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v6 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5905,14 +5923,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] +; GFX1032-DPP-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v7, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 @@ -5944,53 +5963,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf 
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; 
%bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1164-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -6020,46 +6040,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_mov_b32 v3, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v5 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] +; GFX1132-DPP-NEXT: global_load_b32 v6, v5, s[0:1] ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v5, v6, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v5, v7, v[5:6], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, v5 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -7717,54 +7737,57 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; 
GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -7774,18 +7797,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, 
s[46:47] @@ -7799,27 +7822,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -7849,19 +7874,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, 
div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -7873,24 +7900,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 
row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] @@ -7901,49 +7928,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[43:44] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 @@ -7974,19 +8003,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; 
GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -7998,71 +8029,73 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: 
v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], 
-v[41:42] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[43:44] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -8107,30 +8140,31 @@ define amdgpu_kernel 
void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -8144,16 +8178,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; 
GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8170,13 +8204,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -8230,46 +8264,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf 
bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8285,10 +8318,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -9330,35 +9364,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -9368,19 +9403,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] 
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -9388,12 +9423,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: @@ -9433,24 +9469,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 
0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -9461,27 +9497,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 @@ -9522,49 +9559,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; 
GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] -; GFX1032-DPP-NEXT: 
global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 @@ -9595,30 +9633,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -9629,31 +9668,33 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], 
-v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -9685,56 +9726,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -10763,35 +10804,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], 
v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -10801,19 +10843,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -10821,12 +10863,13 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 ; GFX9-DPP-NEXT: .LBB14_3: @@ -10866,24 +10909,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: 
v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -10894,27 +10937,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 @@ -10955,49 +10999,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 
v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 @@ -11028,30 +11073,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: 
v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -11062,31 +11108,33 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -11118,56 +11166,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 
s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -11678,35 +11726,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 
v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -11716,19 +11765,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] ; 
GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] @@ -11736,12 +11785,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v1 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 ; GFX9-DPP-NEXT: .LBB15_3: @@ -11781,24 +11831,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; 
GFX1064-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] @@ -11809,27 +11859,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[7:8], v[9:10] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 @@ -11870,49 +11921,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] ; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[7:8], 
v[9:10], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[7:8], v2, v[7:10], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[9:10] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, v7 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 @@ -11943,30 +11995,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 @@ -11977,31 +12030,33 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1164-DPP-NEXT: 
v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, v7 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -12033,56 +12088,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v8, s[0:1] ; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 @@ -13771,54 +13826,57 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 
1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf @@ -13828,18 +13886,18 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] @@ -13853,27 +13911,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 +; GFX9-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: 
s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s43 ; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 @@ -13903,19 +13963,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 
v31, v40 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -13927,24 +13989,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: 
v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] @@ -13955,49 +14017,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v44, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) 
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[43:44] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 
@@ -14028,19 +14092,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v41 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v1, v0 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] @@ -14052,71 +14118,73 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: 
v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, 
s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v44, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[43:44] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v0, 20, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v5, 10, v41 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: 
s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v42, v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 @@ -14161,30 +14229,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 
v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 @@ -14198,16 +14267,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -14224,13 +14293,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -14284,46 +14353,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, 
v9 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v11, v9 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v10, v8 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -14339,10 +14407,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 8 +; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] diff --git a/llvm/test/CodeGen/AMDGPU/licm-wwm.mir b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir index fc20674971a71..699e109f3b8cd 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-wwm.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir @@ -2,27 +2,28 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=early-machinelicm,si-wqm -o - %s | FileCheck -check-prefix=GCN %s # Machine LICM may hoist an intruction from a WWM region, which will force SI-WQM pass -# to create a second WWM region. This is an unwanted hoisting. +# to create a second WWM region. This is an unwanted hoisting. Make sure it does not happen. --- name: licm_move_wwm tracksRegLiveness: true +machineFunctionInfo: + hasInitWholeWave: true + body: | ; GCN-LABEL: name: licm_move_wwm ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]] ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[ENTER_STRICT_WWM1:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec - ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM1]] + ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]] ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_READFIRSTLANE_B32_]] ; GCN-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 6f841c88a6d8b..81566626c58f6 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -4,11 +4,11 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 -; GCN-NEXT: v_mov_b32_e32 v3, 1 +; GCN-NEXT: v_mov_b32_e32 v4, 1 ; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 @@ -16,19 +16,20 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: s_or_saveexec_b32 s1, -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v4, s1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v4, -1 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen +; GCN-NEXT: 
v_mov_b32_e32 v0, v3 +; GCN-NEXT: v_mov_b32_e32 v5, -1 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: buffer_store_dword v5, v0, s[4:7], 0 offen ; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, -1 -; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen +; GCN-NEXT: buffer_store_dword v0, v4, s[4:7], 0 offen ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 3519befabd3bc..67e8499e45893 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -24,11 +24,11 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_saveexec_b32 s9, -1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, s4, s9 +; GCN-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s9 -; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5 ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.4: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 1089093ea691c..cc2ecb49c88b2 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -250,32 +250,33 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte 
Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[34:35] ; GFX9-O3-NEXT: s_nop 1 -; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[36:37] ; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 @@ -283,6 +284,7 @@ define amdgpu_gfx void 
@strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 08cc2e4ec7d79..b110dcc00d155 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -217,31 +217,31 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; ; GFX9-O3-LABEL: cfg: ; GFX9-O3: ; %bb.0: ; %entry -; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[4:5] ; GFX9-O3-NEXT: s_nop 1 -; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[6:7] ; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: 
v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 @@ -1069,31 +1069,31 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; ; GFX9-O3-LABEL: strict_wwm_cfg: ; GFX9-O3: ; %bb.0: ; %entry -; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[4:5] ; GFX9-O3-NEXT: s_nop 1 -; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[6:7] ; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-O3-NEXT: 
v_add_u32_e32 v2, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll index f3b8deff61918..1b67772c140eb 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -15,3 +15,21 @@ bb.1: bb.2: ret void } + +; GCN-LABEL: name: wwm +; GCN: hasInitWholeWave: true +define void @wwm(ptr addrspace(1) inreg %p) { + %val = load i32, ptr addrspace(1) %p + %wwm = tail call i32 @llvm.amdgcn.wwm.i32(i32 %val) + store i32 %wwm, ptr addrspace(1) %p + ret void +} + +; GCN-LABEL: name: strict_wwm +; GCN: hasInitWholeWave: true +define void @strict_wwm(ptr addrspace(1) inreg %p) { + %val = load i32, ptr addrspace(1) %p + %wwm = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %val) + store i32 %wwm, ptr addrspace(1) %p + ret void +}