diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 02cd125eeff09..a913472525cf5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4042,7 +4042,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A) - if (hasOperation(ISD::ABS, VT) && + if ((!LegalOperations || hasOperation(ISD::ABS, VT)) && sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) && sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1)))) return DAG.getNode(ISD::ABS, DL, VT, A); @@ -9526,7 +9526,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) - if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { + if (!LegalOperations || hasOperation(ISD::ABS, VT)) { SDValue A = N0Opcode == ISD::ADD ? N0 : N1; SDValue S = N0Opcode == ISD::SRA ? N0 : N1; if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 78aaaca4e185b..e3270471981cc 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1623,9 +1623,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setPrefFunctionAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); - - if (Subtarget->isThumb() || Subtarget->isThumb2()) - setTargetDAGCombine(ISD::ABS); } bool ARMTargetLowering::useSoftFloat() const { @@ -13504,18 +13501,6 @@ static SDValue PerformVSetCCToVCTPCombine(SDNode *N, DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); } -static SDValue PerformABSCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) - return SDValue(); - - return TLI.expandABS(N, DAG); -} - /// PerformADDECombine - Target-specific dag combine transform from /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL @@ -18879,7 +18864,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); - case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index c600478b66402..fcabc9076e4d3 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -5670,22 +5670,6 @@ def : Pat<(v2i64 (zext (abdu (v2i32 DPR:$opA), (v2i32 DPR:$opB)))), (VABDLuv2i64 DPR:$opA, DPR:$opB)>; } -// ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the -// shift/xor pattern for ABS. -// TODO: Remove me. -def abd_shr : - PatFrag<(ops node:$in1, node:$in2, node:$shift), - (ARMvshrsImm (sub (zext node:$in1), - (zext node:$in2)), (i32 $shift))>; - -let Predicates = [HasNEON] in { -def : Pat<(xor (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)), - (v2i64 (add (sub (zext (v2i32 DPR:$opA)), - (zext (v2i32 DPR:$opB))), - (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), - (VABDLuv2i64 DPR:$opA, DPR:$opB)>; -} - // VABA : Vector Absolute Difference and Accumulate defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, "vaba", "s", abds, add>; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 5c40a4ce13e31..cb59121d69708 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -135,31 +135,31 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0 +; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 +; GCN-NEXT: v_max_i32_e32 v1, v0, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 +; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 +; GCN-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5 %op = sdiv i32 1000000, %select @@ -217,31 +217,31 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0 +; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 +; GCN-NEXT: v_max_i32_e32 v1, v0, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 +; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 +; GCN-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32) %op = sdiv i32 1000000, %select diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 2ad28b8dd6ecf..8144fb7a3b646 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -245,39 +245,36 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_xor_b32 s3, s3, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_ashr_i32 s9, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s2, s2, s9 +; GFX6-NEXT: s_abs_i32 s8, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s4, 0, s8 ; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_xor_b32 s1, s2, s3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_xor_b32 s0, s9, s8 +; GFX6-NEXT: s_abs_i32 s0, s2 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s3 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_sub_i32 s2, s0, s8 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_cselect_b32 s0, s2, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -286,16 +283,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 -; GFX9-NEXT: s_xor_b32 s4, s5, s4 +; GFX9-NEXT: s_abs_i32 s4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_xor_b32 s3, s2, s3 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -303,18 +297,18 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s5, s3 +; GFX9-NEXT: s_mul_i32 s6, s5, s4 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s6, s2, s4 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 ; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 ; GFX9-NEXT: s_cselect_b32 s2, s6, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -366,37 +360,36 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_xor_b32 s4, s3, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s3, 0, s4 -; GFX6-NEXT: s_ashr_i32 s5, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_abs_i32 s8, s2 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s6, s2, s5 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_ashr_i32 s0, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s4 -; GFX6-NEXT: s_sub_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s4, s7, s6 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s3 +; GFX6-NEXT: s_sub_i32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s1, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32: @@ -404,15 +397,12 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s5, 0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -1857,126 +1847,114 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s12, 31 -; GFX6-NEXT: s_add_i32 s3, s12, s2 -; GFX6-NEXT: s_xor_b32 s3, s3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_abs_i32 s2, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_ashr_i32 s4, s8, 31 -; GFX6-NEXT: s_add_i32 s5, s8, s4 -; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s8 +; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s8, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s4, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_ashr_i32 s4, s13, 31 -; GFX6-NEXT: s_add_i32 s5, s13, s4 -; GFX6-NEXT: s_xor_b32 s5, s5, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: s_sub_i32 s6, 0, s5 +; GFX6-NEXT: s_abs_i32 s4, s13 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX6-NEXT: s_sub_i32 s5, 0, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 -; GFX6-NEXT: s_ashr_i32 s6, s9, 31 -; GFX6-NEXT: s_add_i32 s7, s9, s6 -; GFX6-NEXT: s_xor_b32 s7, s7, s6 +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s5, v2 +; GFX6-NEXT: s_abs_i32 s5, s9 +; GFX6-NEXT: s_ashr_i32 s9, s6, 31 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: s_xor_b32 s9, s6, s4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX6-NEXT: v_readfirstlane_b32 s4, v2 -; GFX6-NEXT: s_mul_i32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s6, s4, s5 -; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_readfirstlane_b32 s6, v2 +; GFX6-NEXT: s_mul_i32 s6, s6, s4 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_sub_i32 s6, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: s_cselect_b32 s5, s6, s5 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_ashr_i32 s6, s14, 31 -; GFX6-NEXT: s_add_i32 s7, s14, s6 -; GFX6-NEXT: s_xor_b32 s7, s7, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX6-NEXT: s_sub_i32 s12, 0, s7 +; GFX6-NEXT: s_abs_i32 s6, s14 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX6-NEXT: s_sub_i32 s7, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GFX6-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v4 -; GFX6-NEXT: s_ashr_i32 s12, s10, 31 -; GFX6-NEXT: s_add_i32 s10, s10, s12 -; GFX6-NEXT: s_xor_b32 s10, s10, s12 +; GFX6-NEXT: v_mul_lo_u32 v5, s7, v4 +; GFX6-NEXT: s_abs_i32 s7, s10 +; GFX6-NEXT: s_xor_b32 s10, s10, s14 +; GFX6-NEXT: s_ashr_i32 s10, s10, 31 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: s_xor_b32 s12, s12, s6 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_hi_u32 v4, s10, v4 -; GFX6-NEXT: v_readfirstlane_b32 s6, v4 -; GFX6-NEXT: s_mul_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s6, s10, s6 -; GFX6-NEXT: s_sub_i32 s10, s6, s7 -; GFX6-NEXT: s_cmp_ge_u32 s6, s7 +; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 +; GFX6-NEXT: v_readfirstlane_b32 s12, v4 +; GFX6-NEXT: s_mul_i32 s12, s12, s6 +; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s12, s7, s6 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: s_cselect_b32 s6, s10, s6 +; GFX6-NEXT: s_cselect_b32 s7, s12, s7 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s6, s7 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_ashr_i32 s10, s15, 31 -; GFX6-NEXT: s_add_i32 s13, s15, s10 -; GFX6-NEXT: s_xor_b32 s13, s13, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s13 -; GFX6-NEXT: s_sub_i32 s0, 0, s13 +; GFX6-NEXT: s_abs_i32 s12, s15 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] -; GFX6-NEXT: v_xor_b32_e32 v4, s12, v4 +; GFX6-NEXT: s_abs_i32 s1, s11 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v3 -; GFX6-NEXT: s_ashr_i32 s0, s11, 31 -; GFX6-NEXT: s_add_i32 s1, s11, s0 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX6-NEXT: s_xor_b32 s0, s0, s10 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 -; GFX6-NEXT: v_readfirstlane_b32 s2, v3 -; GFX6-NEXT: s_mul_i32 s2, s2, s13 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[6:7] +; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v6 +; GFX6-NEXT: s_xor_b32 s0, s11, s15 +; GFX6-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: v_mul_hi_u32 v2, v6, v2 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v3 +; GFX6-NEXT: v_readfirstlane_b32 s2, v4 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s13 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s13 +; GFX6-NEXT: s_sub_i32 s2, s1, s12 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GFX6-NEXT: s_cmp_ge_u32 s1, s12 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s13 +; GFX6-NEXT: s_cmp_ge_u32 s1, s12 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 @@ -1990,16 +1968,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s8, 31 -; GFX9-NEXT: s_add_i32 s3, s8, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_ashr_i32 s8, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_xor_b32 s2, s8, s2 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s3, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s3 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -2007,108 +1982,99 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s3 +; GFX9-NEXT: s_mul_i32 s12, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_sub_i32 s12, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s12, s8 -; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_i32 s8, s9, s4 -; GFX9-NEXT: s_xor_b32 s8, s8, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s12, s8 +; GFX9-NEXT: s_abs_i32 s4, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s8, s5, s9 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s9, s4 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s5, s9 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_ashr_i32 s8, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, 0, s8 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s9, s5 -; GFX9-NEXT: s_add_i32 s9, s9, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s9 -; GFX9-NEXT: s_mul_i32 s9, s5, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s9 -; GFX9-NEXT: s_add_i32 s12, s5, 1 -; GFX9-NEXT: s_sub_i32 s9, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s9, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s12, s3, 1 +; GFX9-NEXT: s_sub_i32 s9, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s12, s3 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_add_i32 s9, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s9, s5 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_ashr_i32 s9, s6, 31 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s9, s5 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s4, s6, s9 +; GFX9-NEXT: s_xor_b32 s5, s6, s10 +; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s6, s9, s6 -; GFX9-NEXT: s_add_i32 s9, s9, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 -; GFX9-NEXT: s_mul_i32 s9, s6, s8 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_add_i32 s10, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s4, s8 -; GFX9-NEXT: s_cmp_ge_u32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s6, s10, s6 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s9, s6 -; GFX9-NEXT: s_ashr_i32 s6, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s6 -; GFX9-NEXT: s_xor_b32 s8, s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s9, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 +; GFX9-NEXT: s_add_i32 s10, s8, 1 +; GFX9-NEXT: s_sub_i32 s9, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s8 +; GFX9-NEXT: s_abs_i32 s6, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s7, s7, s2 -; GFX9-NEXT: s_xor_b32 s6, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s2, s7, s11 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: s_abs_i32 s3, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 ; GFX9-NEXT: s_sub_i32 s4, s4, s5 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s2, s7, s2 -; GFX9-NEXT: s_sub_i32 s5, 0, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s7 -; GFX9-NEXT: s_mul_i32 s7, s5, s8 -; GFX9-NEXT: s_sub_i32 s2, s2, s7 -; GFX9-NEXT: s_add_i32 s9, s5, 1 -; GFX9-NEXT: s_sub_i32 s7, s2, s8 -; GFX9-NEXT: s_cmp_ge_u32 s2, s8 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s7, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s8 -; GFX9-NEXT: s_cselect_b32 s2, s7, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s7, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -2279,116 +2245,104 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s8, 31 -; GFX6-NEXT: s_add_i32 s3, s8, s2 -; GFX6-NEXT: s_xor_b32 s2, s3, s2 +; GFX6-NEXT: s_abs_i32 s2, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_ashr_i32 s3, s4, 31 -; GFX6-NEXT: s_add_i32 s4, s4, s3 -; GFX6-NEXT: s_xor_b32 s4, s4, s3 +; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: s_mul_i32 s8, s8, s2 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_sub_i32 s8, s4, s2 -; GFX6-NEXT: s_cmp_ge_u32 s4, s2 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s2 -; GFX6-NEXT: s_cmp_ge_u32 s4, s2 -; GFX6-NEXT: s_cselect_b32 s2, s8, s4 -; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_i32 s8, s9, s4 -; GFX6-NEXT: s_xor_b32 s4, s8, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s8, 0, s4 -; GFX6-NEXT: s_xor_b32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s9, s2, s3 +; GFX6-NEXT: s_sub_i32 s3, s3, s8 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s8, s3 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s2, s8, s3 +; GFX6-NEXT: s_abs_i32 s3, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 -; GFX6-NEXT: s_add_i32 s5, s5, s8 -; GFX6-NEXT: s_xor_b32 s5, s5, s8 +; GFX6-NEXT: s_abs_i32 s8, s5 +; GFX6-NEXT: s_ashr_i32 s5, s5, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s4 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s4 -; GFX6-NEXT: s_cmp_ge_u32 s2, s4 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s4 -; GFX6-NEXT: s_cmp_ge_u32 s2, s4 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_ashr_i32 s3, s10, 31 -; GFX6-NEXT: s_add_i32 s4, s10, s3 -; GFX6-NEXT: s_xor_b32 s3, s4, s3 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_abs_i32 s3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s5 +; GFX6-NEXT: s_sub_i32 s5, s2, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_ashr_i32 s4, s6, 31 -; GFX6-NEXT: s_add_i32 s5, s6, s4 -; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 +; GFX6-NEXT: s_abs_i32 s8, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_sub_i32 s6, s2, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s5, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 ; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s5, s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 ; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s5, s5, s2 -; GFX6-NEXT: s_ashr_i32 s2, s11, 31 -; GFX6-NEXT: s_add_i32 s3, s11, s2 -; GFX6-NEXT: s_xor_b32 s8, s3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s10, 0, s8 -; GFX6-NEXT: s_xor_b32 s5, s5, s4 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_cselect_b32 s8, s8, s2 +; GFX6-NEXT: s_abs_i32 s9, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s9 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s9 -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: s_xor_b32 s7, s7, s9 -; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 -; GFX6-NEXT: s_mul_i32 s5, s5, s8 -; GFX6-NEXT: s_sub_i32 s5, s7, s5 -; GFX6-NEXT: s_sub_i32 s6, s5, s8 -; GFX6-NEXT: s_cmp_ge_u32 s5, s8 -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 -; GFX6-NEXT: s_sub_i32 s6, s5, s8 -; GFX6-NEXT: s_cmp_ge_u32 s5, s8 -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 -; GFX6-NEXT: s_xor_b32 s5, s5, s9 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_abs_i32 s4, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s7, 31 +; GFX6-NEXT: s_xor_b32 s7, s8, s6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX6-NEXT: s_sub_i32 s6, s7, s6 +; GFX6-NEXT: v_readfirstlane_b32 s7, v2 +; GFX6-NEXT: s_mul_i32 s7, s7, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -2398,15 +2352,12 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s8, 31 -; GFX9-NEXT: s_add_i32 s3, s8, s2 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_abs_i32 s2, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s8, 0, s2 ; GFX9-NEXT: s_ashr_i32 s3, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s4, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -2422,51 +2373,44 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s8, s4, s2 ; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s2, s8, s4 -; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_i32 s8, s9, s4 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s5, s5, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s3, s5, s8 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s9, s5 -; GFX9-NEXT: s_add_i32 s9, s9, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 ; GFX9-NEXT: s_sub_i32 s5, s3, s4 ; GFX9-NEXT: s_cmp_ge_u32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; GFX9-NEXT: s_sub_i32 s5, s3, s4 ; GFX9-NEXT: s_cmp_ge_u32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_ashr_i32 s4, s10, 31 -; GFX9-NEXT: s_add_i32 s5, s10, s4 -; GFX9-NEXT: s_xor_b32 s4, s5, s4 +; GFX9-NEXT: s_abs_i32 s4, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: s_sub_i32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s5, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_xor_b32 s6, s6, s5 +; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s9, s8 -; GFX9-NEXT: s_add_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s9 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 ; GFX9-NEXT: s_mul_i32 s8, s8, s4 ; GFX9-NEXT: s_sub_i32 s6, s6, s8 ; GFX9-NEXT: s_sub_i32 s8, s6, s4 @@ -2475,36 +2419,34 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s8, s6, s4 ; GFX9-NEXT: s_cmp_ge_u32 s6, s4 ; GFX9-NEXT: s_cselect_b32 s4, s8, s6 -; GFX9-NEXT: s_ashr_i32 s6, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s6 -; GFX9-NEXT: s_xor_b32 s6, s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_abs_i32 s6, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_xor_b32 s3, s4, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_add_i32 s4, s7, s2 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, 0, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_abs_i32 s3, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_xor_b32 s4, s4, s2 -; GFX9-NEXT: v_readfirstlane_b32 s7, v2 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s7 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_sub_i32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s5, s4, s6 -; GFX9-NEXT: s_cmp_ge_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s6 -; GFX9-NEXT: s_cmp_ge_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_xor_b32 s4, s4, s2 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -6538,71 +6480,65 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_ashr_i32 s3, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_abs_i32 s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s4, s2 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_ashr_i32 s6, s4, 31 -; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_abs_i32 s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s6, s6, s3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s6, s2, s3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b32 s2, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_ashr_i32 s4, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s4 -; GFX6-NEXT: s_xor_b32 s7, s7, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 -; GFX6-NEXT: s_sub_i32 s8, 0, s7 +; GFX6-NEXT: s_abs_i32 s6, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: s_xor_b32 s7, s5, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_abs_i32 s5, s5 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: s_ashr_i32 s7, s7, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v3, s8, v2 -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 -; GFX6-NEXT: s_add_i32 s5, s5, s8 -; GFX6-NEXT: s_xor_b32 s5, s5, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 -; GFX6-NEXT: s_xor_b32 s4, s8, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v1 -; GFX6-NEXT: s_mul_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_sub_i32 s6, s5, s7 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: s_mul_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6613,65 +6549,59 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_ashr_i32 s3, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_abs_i32 s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s7, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s3, s7, s3 -; GFX9-NEXT: s_xor_b32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s8 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s7, s2 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s7, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s7, s9, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s8, s7 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s4 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s7, s5, 31 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s3 +; GFX9-NEXT: s_sub_i32 s7, s7, s8 +; GFX9-NEXT: s_add_i32 s9, s4, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_cselect_b32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s4 +; GFX9-NEXT: s_abs_i32 s4, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s3, s5, s7 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_ashr_i32 s6, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, 0, s6 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s7 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_add_i32 s8, s5, 1 -; GFX9-NEXT: s_sub_i32 s7, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s7, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s8, s3, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s7, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -7001,19 +6931,16 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_ashr_i32 s3, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s3 +; GFX6-NEXT: s_abs_i32 s2, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s3, 0, s2 -; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_add_i32 s3, s4, s6 -; GFX6-NEXT: s_xor_b32 s3, s3, s6 -; GFX6-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 @@ -7026,38 +6953,35 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s7, s3, s2 ; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: s_cselect_b32 s7, s7, s3 -; GFX6-NEXT: s_ashr_i32 s2, s4, 31 -; GFX6-NEXT: s_add_i32 s4, s4, s2 -; GFX6-NEXT: s_xor_b32 s4, s4, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s2, 0, s4 -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 -; GFX6-NEXT: s_xor_b32 s7, s7, s6 +; GFX6-NEXT: s_abs_i32 s6, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s8, s5 +; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_ashr_i32 s5, s5, 31 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_add_i32 s2, s5, s8 -; GFX6-NEXT: s_xor_b32 s5, s2, s8 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s4 -; GFX6-NEXT: s_sub_i32 s5, s5, s7 -; GFX6-NEXT: s_sub_i32 s7, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s5, s7, s5 -; GFX6-NEXT: s_sub_i32 s7, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s7, s5 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s7, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s8, s7 +; GFX6-NEXT: s_sub_i32 s8, s7, s6 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 +; GFX6-NEXT: s_sub_i32 s8, s7, s6 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 +; GFX6-NEXT: s_cselect_b32 s6, s8, s7 +; GFX6-NEXT: s_xor_b32 s6, s6, s5 +; GFX6-NEXT: s_sub_i32 s5, s6, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -7068,16 +6992,13 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_ashr_i32 s3, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 @@ -7093,24 +7014,21 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_sub_i32 s7, s4, s2 ; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s2, s7, s4 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s4, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 ; GFX9-NEXT: s_mul_i32 s6, s6, s3 ; GFX9-NEXT: s_sub_i32 s5, s5, s6 ; GFX9-NEXT: s_sub_i32 s6, s5, s3 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 4d8687b141a79..5bbea7ecf3f2d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -575,34 +575,33 @@ define i32 @sdiv32(i32 %a, i32 %b) { ; GFX9-LABEL: sdiv32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, 0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 +; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1 +; GFX9-NEXT: v_max_i32_e32 v2, v1, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 +; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0 +; GFX9-NEXT: v_max_i32_e32 v5, v0, v5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = sdiv i32 %a, %b ret i32 %d @@ -640,31 +639,30 @@ define i32 @srem32(i32 %a, i32 %b) { ; GFX9-LABEL: srem32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1 +; GFX9-NEXT: v_max_i32_e32 v1, v1, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0 +; GFX9-NEXT: v_max_i32_e32 v4, v0, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 +; GFX9-NEXT: v_sub_u32_e32 v2, v4, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v2, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_sub_u32_e32 v3, v2, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = srem i32 %a, %b ret i32 %d diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 7d8eba1e87080..f0ab3a5342e01 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -9,26 +9,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_sdiv_i128_vv: ; GFX9: ; %bb.0: ; %_udiv-special-cases ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v16, 31, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v16, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 -; GFX9-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7 -; GFX9-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, v17, v4 -; GFX9-NEXT: v_xor_b32_e32 v2, v17, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v3, v17 -; GFX9-NEXT: v_xor_b32_e32 v0, v17, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v2, v17, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, v17, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v17, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v17, vcc -; GFX9-NEXT: v_or_b32_e32 v3, v21, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v20, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v21, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v3, v20, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v21, v0 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v3, v9, v11 ; GFX9-NEXT: v_or_b32_e32 v2, v8, v10 @@ -37,35 +39,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 ; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 ; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_ffbh_u32_e32 v3, v20 +; GFX9-NEXT: v_ffbh_u32_e32 v3, v21 ; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v21 +; GFX9-NEXT: v_ffbh_u32_e32 v4, v20 ; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_ffbh_u32_e32 v5, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_ffbh_u32_e32 v3, v10 ; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_ffbh_u32_e32 v5, v8 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v9 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v6 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v9 +; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 64, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 64, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[6:7], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v18, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc @@ -138,8 +140,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v21 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 @@ -164,9 +166,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 -; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc ; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc @@ -218,238 +220,244 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s10, 2 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, s10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v3, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v14, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v20, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v20, v1, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v14, v1, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_xor_b32_e64 v11, v3, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v3, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v18, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v8, v9, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v13, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v15, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[21:22], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v15, v8, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v13, v8, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v15, v15, v20 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 -; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: s_mov_b32 s13, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v8, v9 +; GFX9-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr14 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: s_mov_b32 s16, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b32 s18, s15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 ; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: v_min_u32_e64 v12, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: s_mov_b32 s12, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: s_mov_b32 s14, s15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[12:13], v11, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 @@ -462,8 +470,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 @@ -546,75 +552,75 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(6) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -647,67 +653,67 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] @@ -847,72 +853,72 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -993,51 +999,51 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1074,14 +1080,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] @@ -1127,12 +1133,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 @@ -1147,39 +1153,39 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1224,11 +1230,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; kill: killed $vgpr4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 16a03badcb132..e04cd71125608 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,185 +6,187 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 -; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v11 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v26, v24 -; SDAG-NEXT: v_mov_b32_e32 v27, v25 -; SDAG-NEXT: v_xor_b32_e32 v17, v24, v3 -; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1 -; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0 -; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11 -; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10 -; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9 -; SDAG-NEXT: v_xor_b32_e32 v8, v25, v8 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v0, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc -; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v0, v2, v10 -; SDAG-NEXT: v_ffbh_u32_e32 v17, v10 -; SDAG-NEXT: v_min_u32_e32 v18, v1, v18 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25 -; SDAG-NEXT: v_or_b32_e32 v1, v3, v11 -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17 -; SDAG-NEXT: v_ffbh_u32_e32 v17, v11 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v17 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc -; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v29 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v28, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v0 -; SDAG-NEXT: v_min_u32_e32 v20, v9, v20 -; SDAG-NEXT: v_or_b32_e32 v9, v29, v1 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19 +; SDAG-NEXT: v_mov_b32_e32 v28, v26 +; SDAG-NEXT: v_mov_b32_e32 v29, v27 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0, v8 +; SDAG-NEXT: v_or_b32_e32 v0, v16, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v2 +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v1 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v17, v3 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v21 +; SDAG-NEXT: v_min_u32_e32 v18, v22, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v3 +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v20, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 +; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v9, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v8, v1, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v20, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v8, v31, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v9 +; SDAG-NEXT: v_or_b32_e32 v9, v30, v1 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 +; SDAG-NEXT: v_min_u32_e32 v20, v20, v21 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20 -; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v19, v21 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7] -; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v8, v11, v21 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v20 +; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v8 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v17, v18 +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 ; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc -; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8 -; SDAG-NEXT: v_or_b32_e32 v19, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 +; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[2:3], v24 +; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 ; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 +; SDAG-NEXT: v_sub_i32_e32 v37, vcc, 64, v32 +; SDAG-NEXT: v_subrev_i32_e32 v48, vcc, 64, v32 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32 +; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v17, v49 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v48 -; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_lshl_b64 v[38:39], v[2:3], v37 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v48 +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v11, v39 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v38 +; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v24, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v1, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v9 ; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v25, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v16 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v10 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v24 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v25 ; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v2 ; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16 -; SDAG-NEXT: v_and_b32_e32 v39, v38, v28 -; SDAG-NEXT: v_and_b32_e32 v48, v38, v29 -; SDAG-NEXT: v_and_b32_e32 v49, v38, v0 -; SDAG-NEXT: v_and_b32_e32 v16, 1, v38 -; SDAG-NEXT: v_and_b32_e32 v38, v38, v1 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v16, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v17, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v25, v24, v31 +; SDAG-NEXT: v_and_b32_e32 v48, v24, v30 +; SDAG-NEXT: v_and_b32_e32 v49, v24, v0 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v24 +; SDAG-NEXT: v_and_b32_e32 v50, v24, v1 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v25 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc -; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v16, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v17, v50, vcc +; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v32, v34 +; SDAG-NEXT: v_or_b32_e32 v17, v33, v35 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v17 -; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: v_mov_b32_e32 v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v22, v10 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 @@ -196,68 +198,70 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 ; SDAG-NEXT: v_or_b32_e32 v20, v19, v1 -; SDAG-NEXT: v_or_b32_e32 v21, v17, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v18, v0 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v2 +; SDAG-NEXT: v_or_b32_e32 v22, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v21, v18, v0 +; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7 -; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 +; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v22, v18 -; SDAG-NEXT: v_mov_b32_e32 v23, v19 -; SDAG-NEXT: v_xor_b32_e32 v0, v18, v7 -; SDAG-NEXT: v_xor_b32_e32 v1, v18, v6 -; SDAG-NEXT: v_xor_b32_e32 v3, v18, v5 -; SDAG-NEXT: v_xor_b32_e32 v2, v18, v4 -; SDAG-NEXT: v_xor_b32_e32 v6, v19, v15 -; SDAG-NEXT: v_xor_b32_e32 v7, v19, v14 -; SDAG-NEXT: v_xor_b32_e32 v8, v19, v13 -; SDAG-NEXT: v_xor_b32_e32 v10, v19, v12 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v18, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v5, v2 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v1, v18, vcc -; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v5 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v3 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v0, v18, vcc +; SDAG-NEXT: v_mov_b32_e32 v18, v16 +; SDAG-NEXT: v_mov_b32_e32 v19, v17 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12 ; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 -; SDAG-NEXT: v_ffbh_u32_e32 v12, v4 -; SDAG-NEXT: v_min_u32_e32 v11, v1, v11 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v10, v19 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v4 +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc ; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v12 -; SDAG-NEXT: v_ffbh_u32_e32 v12, v5 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v8, v19, vcc -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 -; SDAG-NEXT: v_min_u32_e32 v8, v10, v12 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v13, 0, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v7, v19, vcc -; SDAG-NEXT: v_add_i32_e64 v7, s[8:9], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v12, v29 -; SDAG-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v6, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v6, v28, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 -; SDAG-NEXT: v_min_u32_e32 v12, v7, v12 -; SDAG-NEXT: v_or_b32_e32 v7, v29, v1 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v13, v1 -; SDAG-NEXT: v_add_i32_e32 v12, vcc, 64, v12 -; SDAG-NEXT: v_addc_u32_e64 v14, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 +; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v24, v13, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v25, v12, v7, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_min_u32_e32 v1, v8, v30 +; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6 +; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v10, v25 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v24 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v8, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v25, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v0 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_or_b32_e32 v7, v24, v1 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 +; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_min_u32_e32 v6, v11, v13 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v7, v14, 0, s[6:7] -; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc +; SDAG-NEXT: v_min_u32_e32 v6, v8, v14 +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc ; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 ; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] @@ -272,7 +276,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] @@ -318,7 +322,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v25 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 @@ -326,7 +330,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 ; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v24, vcc ; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 ; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc @@ -363,8 +367,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v10, 1, v15 ; SDAG-NEXT: v_and_b32_e32 v38, v15, v1 ; SDAG-NEXT: v_and_b32_e32 v39, v15, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v15, v29 -; SDAG-NEXT: v_and_b32_e32 v15, v15, v28 +; SDAG-NEXT: v_and_b32_e32 v48, v15, v24 +; SDAG-NEXT: v_and_b32_e32 v15, v15, v25 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc ; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc @@ -396,14 +400,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 -; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 -; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22 -; SDAG-NEXT: v_xor_b32_e32 v6, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v3, v29, v28 +; SDAG-NEXT: v_xor_b32_e32 v2, v27, v26 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 ; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 -; SDAG-NEXT: v_xor_b32_e32 v5, v17, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3 -; SDAG-NEXT: v_xor_b32_e32 v0, v16, v2 +; SDAG-NEXT: v_xor_b32_e32 v5, v21, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v22, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 ; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 ; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 ; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 @@ -1553,118 +1557,119 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 -; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v11 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v29, v28 -; SDAG-NEXT: v_xor_b32_e32 v18, v3, v28 -; SDAG-NEXT: v_xor_b32_e32 v19, v2, v28 -; SDAG-NEXT: v_xor_b32_e32 v1, v1, v28 -; SDAG-NEXT: v_xor_b32_e32 v0, v0, v28 -; SDAG-NEXT: v_xor_b32_e32 v11, v11, v16 -; SDAG-NEXT: v_xor_b32_e32 v10, v10, v16 -; SDAG-NEXT: v_xor_b32_e32 v20, v9, v16 -; SDAG-NEXT: v_xor_b32_e32 v9, v8, v16 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v28 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v28, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v19, v28, vcc -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v2, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v0 -; SDAG-NEXT: v_min_u32_e32 v19, v19, v21 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, v9, v16 -; SDAG-NEXT: v_or_b32_e32 v9, v3, v1 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 +; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 64, v19 -; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v30, vcc, v20, v16, vcc -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] -; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v21 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, 0, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v10, v16, vcc -; SDAG-NEXT: v_add_i32_e64 v21, s[8:9], 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v16, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v31, v8 -; SDAG-NEXT: v_ffbh_u32_e32 v16, v8 -; SDAG-NEXT: v_min_u32_e32 v19, v21, v22 -; SDAG-NEXT: v_or_b32_e32 v11, v30, v9 -; SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v16 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v9 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 -; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_min_u32_e32 v10, v16, v21 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7] -; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v18 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v10 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v1 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v10, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; SDAG-NEXT: v_min_u32_e32 v3, v22, v24 +; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v20, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v10, v31 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, v8, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v2 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 +; SDAG-NEXT: v_min_u32_e32 v10, v10, v20 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_min_u32_e32 v8, v11, v18 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v22 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v21, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v17, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v18 +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v27, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v33, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 ; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], v24 ; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v24 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v25 -; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v32 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 @@ -1677,43 +1682,43 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 ; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v17, v27 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v26 -; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v8, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v11, v27 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v26 +; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v49, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v48, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v49, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v48, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v9, vcc +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v17, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v24, v16, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v11, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v10, v16, vcc +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v25 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v25 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v26, v26, v16 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v10 ; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v49 -; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v36, v24 -; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v25, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v38, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v39, v27, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 -; SDAG-NEXT: v_and_b32_e32 v48, v16, v31 -; SDAG-NEXT: v_and_b32_e32 v49, v16, v30 -; SDAG-NEXT: v_and_b32_e32 v50, v16, v8 -; SDAG-NEXT: v_and_b32_e32 v51, v16, v9 -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v49 +; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v24 +; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v48, v10, v31 +; SDAG-NEXT: v_and_b32_e32 v49, v10, v30 +; SDAG-NEXT: v_and_b32_e32 v50, v10, v2 +; SDAG-NEXT: v_and_b32_e32 v51, v10, v3 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48 ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc ; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc @@ -1728,137 +1733,138 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v17 -; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: v_mov_b32_e32 v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v22, v10 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 -; SDAG-NEXT: v_or_b32_e32 v35, v19, v11 -; SDAG-NEXT: v_or_b32_e32 v27, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 -; SDAG-NEXT: v_or_b32_e32 v33, v16, v20 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v22 +; SDAG-NEXT: v_or_b32_e32 v35, v19, v9 +; SDAG-NEXT: v_or_b32_e32 v27, v11, v21 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v8 +; SDAG-NEXT: v_or_b32_e32 v33, v10, v20 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 -; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v34, v26 -; SDAG-NEXT: v_xor_b32_e32 v10, v7, v26 -; SDAG-NEXT: v_xor_b32_e32 v11, v6, v26 -; SDAG-NEXT: v_xor_b32_e32 v5, v5, v26 -; SDAG-NEXT: v_xor_b32_e32 v4, v4, v26 -; SDAG-NEXT: v_xor_b32_e32 v15, v15, v16 -; SDAG-NEXT: v_xor_b32_e32 v14, v14, v16 -; SDAG-NEXT: v_xor_b32_e32 v13, v13, v16 -; SDAG-NEXT: v_xor_b32_e32 v12, v12, v16 -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v26 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v5, v26, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v5, v6 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v11, v26, vcc -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 32, v5 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v7 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v10, v26, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v6, v4 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v4 -; SDAG-NEXT: v_min_u32_e32 v18, v11, v18 -; SDAG-NEXT: v_sub_i32_e32 v37, vcc, v12, v16 -; SDAG-NEXT: v_or_b32_e32 v11, v7, v5 -; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], 32, v19 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v5 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v36, vcc, v13, v16, vcc -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v10, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v6, v8, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v4 +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v10 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v9, v5 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v20 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v5 +; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v14, vcc +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] +; SDAG-NEXT: v_min_u32_e32 v7, v20, v22 +; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v11, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v11, v37 -; SDAG-NEXT: v_min_u32_e32 v12, v12, v19 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v20, 0, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v14, v16, vcc -; SDAG-NEXT: v_add_i32_e64 v13, s[8:9], 32, v11 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v36 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v15, v16, vcc -; SDAG-NEXT: v_or_b32_e32 v12, v37, v10 -; SDAG-NEXT: v_ffbh_u32_e32 v15, v10 -; SDAG-NEXT: v_min_u32_e32 v14, v13, v14 -; SDAG-NEXT: v_or_b32_e32 v13, v36, v11 -; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15 -; SDAG-NEXT: v_ffbh_u32_e32 v16, v11 -; SDAG-NEXT: v_add_i32_e32 v14, vcc, 64, v14 -; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v19, v10, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v13, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v10, v37, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v13, v6 +; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v11 +; SDAG-NEXT: v_or_b32_e32 v11, v36, v7 +; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v7 +; SDAG-NEXT: v_min_u32_e32 v14, v15, v14 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_min_u32_e32 v10, v13, v20 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10 +; SDAG-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v18, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v15, v11, v13 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] -; SDAG-NEXT: v_min_u32_e32 v12, v15, v16 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v20, 0, s[6:7] -; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 -; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v12 -; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v17, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v17, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v14 +; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_and_b32_e32 v14, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v7, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc -; SDAG-NEXT: v_lshl_b64 v[18:19], v[6:7], v18 -; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc -; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc -; SDAG-NEXT: v_or_b32_e32 v13, v38, v48 -; SDAG-NEXT: v_sub_i32_e32 v15, vcc, 0x7f, v12 -; SDAG-NEXT: v_or_b32_e32 v14, v39, v49 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v15 -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v15 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v15 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[13:14] -; SDAG-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 -; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 -; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 -; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v13, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5] +; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10 +; SDAG-NEXT: v_mov_b32_e32 v14, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v11, vcc +; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v12, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v38, v48 +; SDAG-NEXT: v_sub_i32_e32 v13, vcc, 0x7f, v10 +; SDAG-NEXT: v_or_b32_e32 v12, v39, v49 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v13 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v13 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[8:9], v13 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 +; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v12, v19, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v13, v12, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v18, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[16:17], v[6:7], v38 +; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v38 ; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 ; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 @@ -1871,42 +1877,42 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v24 ; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 ; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v17, v25 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v10, vcc +; SDAG-NEXT: v_or_b32_e32 v15, v15, v25 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v24 +; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v54, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v53, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v15, v54, v15, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v53, v14, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v11, vcc +; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v7, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; SDAG-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v23, v15, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, v14, v8, vcc +; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: .LBB2_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13 +; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v13 ; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v14 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v55 -; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 -; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 -; SDAG-NEXT: v_or_b32_e32 v14, v18, v14 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v50, v22 -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v51, v23, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v52, v24, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v53, v25, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v16 -; SDAG-NEXT: v_and_b32_e32 v16, 1, v21 -; SDAG-NEXT: v_and_b32_e32 v54, v21, v11 -; SDAG-NEXT: v_and_b32_e32 v55, v21, v10 +; SDAG-NEXT: v_or_b32_e32 v12, v12, v55 +; SDAG-NEXT: v_or_b32_e32 v13, v19, v13 +; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 +; SDAG-NEXT: v_or_b32_e32 v12, v18, v12 +; SDAG-NEXT: v_sub_i32_e32 v14, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v52, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v14 +; SDAG-NEXT: v_and_b32_e32 v14, 1, v21 +; SDAG-NEXT: v_and_b32_e32 v54, v21, v7 +; SDAG-NEXT: v_and_b32_e32 v55, v21, v6 ; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 ; SDAG-NEXT: v_and_b32_e32 v21, v21, v37 ; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21 @@ -1921,89 +1927,87 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 -; SDAG-NEXT: v_mov_b32_e32 v21, v17 -; SDAG-NEXT: v_mov_b32_e32 v20, v16 +; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 +; SDAG-NEXT: v_mov_b32_e32 v21, v15 +; SDAG-NEXT: v_mov_b32_e32 v20, v14 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13 ; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v20 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v15 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v13 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v12 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v12, v12, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v13 +; SDAG-NEXT: v_or_b32_e32 v15, v15, v11 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v12 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v10 ; SDAG-NEXT: .LBB2_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v33, v9 -; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v8 -; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31 -; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v16, v11 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v17, v10 +; SDAG-NEXT: v_mul_lo_u32 v12, v33, v3 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 +; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v35, v35, v31 +; SDAG-NEXT: v_mul_lo_u32 v38, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: v_mul_lo_u32 v25, v14, v7 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v14, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v39, v15, v6 ; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 ; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0 -; SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v9 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v13, v24 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; SDAG-NEXT: v_mov_b32_e32 v12, v3 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[12:13] +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v16, v2 +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v21, v25 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 ; SDAG-NEXT: v_mov_b32_e32 v24, v23 -; SDAG-NEXT: v_mov_b32_e32 v23, v15 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v33, v2, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v14, v39 -; SDAG-NEXT: v_mov_b32_e32 v14, v11 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15] -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13] -; SDAG-NEXT: v_mov_b32_e32 v2, v9 -; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v24, v2 -; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v2, v8 -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v3, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v23, v13 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[22:23] +; SDAG-NEXT: v_xor_b32_e32 v33, v12, v28 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v16, v39 +; SDAG-NEXT: v_mov_b32_e32 v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v14, v[12:13] +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] +; SDAG-NEXT: v_add_i32_e64 v24, s[4:5], v24, v3 +; SDAG-NEXT: v_addc_u32_e64 v25, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v17, v2, vcc ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v18, v23 -; SDAG-NEXT: v_mov_b32_e32 v23, v15 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23] -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v12 -; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14] -; SDAG-NEXT: v_xor_b32_e32 v16, v16, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v23 +; SDAG-NEXT: v_mov_b32_e32 v23, v13 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v37, v15, v[22:23] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v35, v11 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v30, v27, v[24:25] +; SDAG-NEXT: v_xor_b32_e32 v7, v7, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v18, v9 -; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v35, v20 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, v12 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v38, v11 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v2 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v15, v[13:14] +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v16, v10 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], v17, v19, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v2 +; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], v12, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; SDAG-NEXT: v_xor_b32_e32 v2, v0, v28 ; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v33, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v16, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v7, v29, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v28, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v18, vcc ; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 687188ed5ca39..d4ff845e1edf3 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -331,15 +331,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_abs_i32 s2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s5, 0, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -350,25 +349,25 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX9-NEXT: s_mul_i32 s7, s6, s3 -; GFX9-NEXT: s_sub_i32 s7, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s6, s2 +; GFX9-NEXT: s_sub_i32 s7, s3, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s3 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_sub_i32 s9, s7, s2 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s2 -; GFX9-NEXT: s_sub_i32 s6, s6, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 1 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -377,12 +376,11 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s3, 31 +; GFX10-NEXT: s_abs_i32 s2, s3 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_add_i32 s3, s3, s2 -; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s4, 0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_ashr_i32 s3, s3, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -395,19 +393,19 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX10-NEXT: s_mul_i32 s7, s6, s3 +; GFX10-NEXT: s_mul_i32 s7, s6, s2 ; GFX10-NEXT: s_add_i32 s8, s6, 1 ; GFX10-NEXT: s_sub_i32 s7, s4, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s3 -; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_sub_i32 s9, s7, s2 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 -; GFX10-NEXT: s_sub_i32 s6, s6, s2 +; GFX10-NEXT: s_xor_b32 s6, s6, s3 +; GFX10-NEXT: s_sub_i32 s6, s6, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -425,22 +423,20 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_ashr_i32 s2, s3, 31 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s3, s3, s2 -; GFX11-NEXT: s_xor_b32 s3, s3, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX11-NEXT: s_sub_i32 s4, 0, s3 +; GFX11-NEXT: s_abs_i32 s2, s3 +; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s4, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s4, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_add_i32 s5, s5, s6 @@ -449,21 +445,21 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX11-NEXT: s_mul_i32 s7, s6, s3 +; GFX11-NEXT: s_mul_i32 s7, s6, s2 ; GFX11-NEXT: s_add_i32 s8, s6, 1 ; GFX11-NEXT: s_sub_i32 s7, s4, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s3 -; GFX11-NEXT: s_cmp_ge_u32 s7, s3 +; GFX11-NEXT: s_sub_i32 s9, s7, s2 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s3 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_add_i32 s4, s4, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_xor_b32 s6, s6, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s2 +; GFX11-NEXT: s_sub_i32 s6, s6, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 @@ -495,14 +491,12 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -524,7 +518,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -537,10 +530,8 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_abs_i32 s2, s2 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s2, s2, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -581,10 +572,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_ashr_i32 s3, s2, 31 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s2, s2, s3 -; GFX11-NEXT: s_xor_b32 s2, s2, s3 +; GFX11-NEXT: s_abs_i32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_sub_i32 s3, 0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index f950717c591a9..3e6de32492457 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -17,15 +17,16 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_14 ; GCN-NEXT: ; %bb.1: ; %itofp-if-end -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GCN-NEXT: v_xor_b32_e32 v0, v5, v0 -; GCN-NEXT: v_xor_b32_e32 v1, v5, v1 -; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 -; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v6, v5, v3 -; GCN-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc -; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 +; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc +; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN-NEXT: v_ffbh_u32_e32 v2, v4 ; GCN-NEXT: v_add_u32_e32 v2, 32, v2 ; GCN-NEXT: v_ffbh_u32_e32 v6, v5 @@ -40,7 +41,6 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_sub_u32_e32 v6, 0x80, v7 ; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; GCN-NEXT: ; implicit-def: $vgpr8 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: ; %bb.2: ; %itofp-if-else diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c6aa2182aec80..c5198cdb421a5 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -13,15 +13,16 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB0_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end -; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 -; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 -; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 -; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 -; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc -; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 ; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 @@ -36,7 +37,6 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else @@ -524,16 +524,17 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB2_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end -; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4 -; SDAG-NEXT: v_xor_b32_e32 v5, v0, v5 -; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v0 -; SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 -; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v0, vcc -; SDAG-NEXT: v_xor_b32_e32 v1, v0, v3 -; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v0, vcc -; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v0, vcc +; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4 +; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v0, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc ; SDAG-NEXT: v_add_u32_e32 v0, 32, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v1, v7 ; SDAG-NEXT: v_min_u32_e32 v0, v0, v1 @@ -547,7 +548,6 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 -; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1103,15 +1103,16 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB4_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end -; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 -; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 -; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 -; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 -; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc -; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 ; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 @@ -1126,7 +1127,6 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index b068d87c4d6f4..6b036f675929e 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -10,29 +10,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_srem_i128_vv: ; GFX9: ; %bb.0: ; %_udiv-special-cases ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 -; GFX9-NEXT: v_xor_b32_e32 v10, v2, v20 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v20 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v20 -; GFX9-NEXT: v_xor_b32_e32 v9, v3, v20 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v20, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v10, v20, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v9, v20, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8 -; GFX9-NEXT: v_sub_co_u32_e32 v23, vcc, v4, v8 -; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v5, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc -; GFX9-NEXT: v_or_b32_e32 v7, v21, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc +; GFX9-NEXT: v_or_b32_e32 v7, v22, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v23, v4 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v7, v3, v1 -; GFX9-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ffbh_u32_e32 v6, v4 ; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 @@ -40,38 +42,37 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX9-NEXT: v_ffbh_u32_e32 v7, v23 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_ffbh_u32_e32 v8, v21 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v22 ; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 64, v7 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_ffbh_u32_e32 v7, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v2 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v9 -; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 -; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 -; GFX9-NEXT: v_ffbh_u32_e32 v10, v3 -; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v10 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 32, v10 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v1 +; GFX9-NEXT: v_min_u32_e32 v10, v10, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v10, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 64, v10 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v11, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] ; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v22, v20 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc @@ -82,10 +83,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_6 @@ -98,22 +99,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_sub_u32_e32 v11, 64, v13 ; GFX9-NEXT: v_or_b32_e32 v8, v25, v27 ; GFX9-NEXT: v_or_b32_e32 v7, v24, v26 -; GFX9-NEXT: v_lshlrev_b64 v[9:10], v13, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], v11, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[9:10], v13, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1] ; GFX9-NEXT: v_sub_u32_e32 v6, 63, v6 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 @@ -123,23 +124,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v2, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v18, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 @@ -168,7 +169,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v12, v18, v12 ; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 ; GFX9-NEXT: v_or_b32_e32 v13, v19, v13 -; GFX9-NEXT: v_and_b32_e32 v19, v8, v21 +; GFX9-NEXT: v_and_b32_e32 v19, v8, v22 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 ; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc @@ -207,7 +208,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v21, v13, v[14:15] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15] ; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4 ; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -218,272 +219,283 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mul_lo_u32 v12, v12, v21 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] +; GFX9-NEXT: v_mul_lo_u32 v12, v12, v22 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v10, v[8:9] ; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v20 -; GFX9-NEXT: v_xor_b32_e32 v0, v2, v20 -; GFX9-NEXT: v_xor_b32_e32 v4, v1, v22 -; GFX9-NEXT: v_xor_b32_e32 v1, v3, v22 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v20 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v22, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v20, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v22, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v21, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, v3, v21 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v20, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v21, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O0-LABEL: v_srem_i128_vv: ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11] -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 -; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s10, 2 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v14, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v11, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v9, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[18:19], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 -; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: s_mov_b32 s13, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v8, v9 +; GFX9-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr14 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: s_mov_b32 s16, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b32 s18, s15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 ; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 -; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: v_min_u32_e64 v12, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: s_mov_b32 s12, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: s_mov_b32 s14, s15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[12:13], v11, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 @@ -496,8 +508,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 @@ -580,75 +590,75 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(6) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -681,67 +691,67 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] @@ -881,72 +891,72 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -1027,51 +1037,51 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1108,14 +1118,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] @@ -1161,12 +1171,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 @@ -1181,33 +1191,33 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -1218,10 +1228,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 @@ -1495,11 +1505,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; kill: killed $vgpr4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index ea30a63b0be19..6372d74161fad 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -28,34 +28,33 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 +; GCN-NEXT: v_max_i32_e32 v2, v1, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 +; GCN-NEXT: v_max_i32_e32 v5, v0, v5 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, v0, v1 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_mul_hi_u32 v3, v5, v3 +; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -73,34 +72,33 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 -; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, 0, v1 +; TONGA-NEXT: v_max_i32_e32 v2, v1, v2 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5 +; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 -; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; TONGA-NEXT: v_mul_hi_u32 v3, v0, v3 -; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v0, v1 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3 +; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -115,40 +113,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX9-NEXT: s_mov_b32 s8, s6 ; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: s_abs_i32 s7, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_ashr_i32 s5, s4, 31 +; GFX9-NEXT: s_xor_b32 s5, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s6, s7 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_add_i32 s9, s6, 1 ; GFX9-NEXT: s_sub_i32 s8, s4, s7 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_add_i32 s8, s6, 1 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s5 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -408,62 +403,60 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_xor_b32_e32 v8, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v7, v3 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; GCN-NEXT: v_xor_b32_e32 v4, v0, v2 +; GCN-NEXT: v_xor_b32_e32 v7, v1, v3 +; GCN-NEXT: v_max_i32_e32 v2, v2, v6 +; GCN-NEXT: v_max_i32_e32 v3, v3, v9 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GCN-NEXT: v_max_i32_e32 v0, v0, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v9 +; GCN-NEXT: v_max_i32_e32 v1, v1, v8 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v10 +; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 ; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -481,62 +474,60 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_mov_b32 s4, s0 ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5 -; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7 -; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3 +; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2 +; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v3, v3, v9 +; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 +; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 +; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3 +; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6 ; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 -; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v6 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 -; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6 -; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1 +; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v8 +; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10 +; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8 +; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 +; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2 ; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v6 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -553,69 +544,63 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s6, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_abs_i32 s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s8 +; GFX9-NEXT: s_xor_b32 s0, s7, s0 +; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_xor_b32 s9, s8, s1 -; GFX9-NEXT: s_xor_b32 s1, s7, s8 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_abs_i32 s7, s7 +; GFX9-NEXT: v_readfirstlane_b32 s6, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s0, s9, s0 +; GFX9-NEXT: s_add_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s9 +; GFX9-NEXT: s_mul_i32 s9, s0, s1 +; GFX9-NEXT: s_sub_i32 s7, s7, s9 +; GFX9-NEXT: s_add_i32 s10, s0, 1 +; GFX9-NEXT: s_sub_i32 s9, s7, s1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s0, s10, s0 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s7, s9, s0 +; GFX9-NEXT: s_abs_i32 s9, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s5, s4, s6 +; GFX9-NEXT: s_xor_b32 s6, s7, s8 +; GFX9-NEXT: s_sub_i32 s7, 0, s9 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s1, s8 -; GFX9-NEXT: s_mul_i32 s8, s7, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s9 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 ; GFX9-NEXT: s_add_i32 s10, s7, 1 -; GFX9-NEXT: s_sub_i32 s8, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_sub_i32 s8, s4, s9 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 ; GFX9-NEXT: s_cselect_b32 s7, s10, s7 -; GFX9-NEXT: s_cselect_b32 s1, s8, s1 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 ; GFX9-NEXT: s_add_i32 s8, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s6, s8, s7 -; GFX9-NEXT: s_ashr_i32 s7, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s6, s9 -; GFX9-NEXT: s_add_i32 s5, s5, s8 -; GFX9-NEXT: s_xor_b32 s7, s8, s7 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s9, s8 -; GFX9-NEXT: s_add_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s8, s5, s9 -; GFX9-NEXT: s_mul_i32 s9, s8, s4 -; GFX9-NEXT: s_sub_i32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s10, s8, 1 -; GFX9-NEXT: s_sub_i32 s9, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s8, s10, s8 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_add_i32 s9, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s8 -; GFX9-NEXT: s_xor_b32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: s_cselect_b32 s4, s8, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -819,119 +804,115 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 -; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 -; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 -; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 -; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 -; GCN-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 +; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 +; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_max_i32_e32 v4, v4, v10 +; GCN-NEXT: v_max_i32_e32 v5, v5, v13 +; GCN-NEXT: v_max_i32_e32 v6, v6, v16 +; GCN-NEXT: v_max_i32_e32 v1, v1, v12 +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GCN-NEXT: v_cvt_f32_u32_e32 v16, v6 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v8 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v6 +; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v7 +; GCN-NEXT: v_max_i32_e32 v0, v0, v9 +; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v11 +; GCN-NEXT: v_max_i32_e32 v2, v2, v15 +; GCN-NEXT: v_max_i32_e32 v11, v7, v17 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 ; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 -; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 +; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 +; GCN-NEXT: v_cvt_f32_u32_e32 v18, v11 ; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 -; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 -; GCN-NEXT: v_cvt_f32_u32_e32 v18, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v11 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v12, v13 -; GCN-NEXT: v_mul_hi_u32 v10, v2, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 +; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 ; GCN-NEXT: v_rcp_iflag_f32_e32 v18, v18 -; GCN-NEXT: v_mul_lo_u32 v11, v8, v4 -; GCN-NEXT: v_mul_lo_u32 v13, v9, v5 -; GCN-NEXT: v_mul_lo_u32 v21, v10, v6 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, v16, v17 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v1, v13 +; GCN-NEXT: v_mul_hi_u32 v14, v2, v14 ; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; GCN-NEXT: v_mul_lo_u32 v15, v12, v4 +; GCN-NEXT: v_mul_lo_u32 v17, v13, v5 +; GCN-NEXT: v_mul_lo_u32 v21, v14, v6 ; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v13 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 -; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v1, v5 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 1, v10 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] -; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v9 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v11 +; GCN-NEXT: v_sub_i32_e32 v17, vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v16, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] +; GCN-NEXT: v_mul_lo_u32 v19, v19, v18 +; GCN-NEXT: v_sub_i32_e32 v20, vcc, v2, v6 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] +; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] +; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v13 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v14 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v19, v18 -; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v18, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] -; GCN-NEXT: v_xor_b32_e32 v0, v0, v15 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v16 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v10 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 +; GCN-NEXT: v_mul_hi_u32 v4, v18, v19 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_xor_b32_e32 v6, v8, v14 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v3, v7 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 +; GCN-NEXT: v_max_i32_e32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v5, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v10 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GCN-NEXT: v_mul_lo_u32 v6, v4, v11 +; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, v5, v11 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -950,119 +931,115 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_mov_b32 s8, s0 ; TONGA-NEXT: s_mov_b32 s9, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v9 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v11 -; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v10 -; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 -; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 -; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v13 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v12 -; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 -; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 -; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v6 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v6 +; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 +; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 +; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 +; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 +; TONGA-NEXT: v_max_i32_e32 v6, v6, v16 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 +; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v5 +; TONGA-NEXT: v_cvt_f32_u32_e32 v16, v6 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 ; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; TONGA-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; TONGA-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v5 -; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 -; TONGA-NEXT: v_mul_lo_u32 v11, v11, v10 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v6 +; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v14 +; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v16 +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v7 +; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 +; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v11 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v15 +; TONGA-NEXT: v_max_i32_e32 v11, v7, v17 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v4 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v6 ; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12 -; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 -; TONGA-NEXT: v_mul_hi_u32 v11, v10, v11 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v14 +; TONGA-NEXT: v_mul_lo_u32 v15, v15, v14 +; TONGA-NEXT: v_mul_lo_u32 v17, v17, v16 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v11 ; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 -; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 -; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v7 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v11 -; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 -; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v13 -; TONGA-NEXT: v_mul_hi_u32 v10, v2, v10 +; TONGA-NEXT: v_mul_hi_u32 v15, v14, v15 +; TONGA-NEXT: v_mul_hi_u32 v17, v16, v17 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18 -; TONGA-NEXT: v_mul_lo_u32 v11, v8, v4 -; TONGA-NEXT: v_mul_lo_u32 v13, v9, v5 -; TONGA-NEXT: v_mul_lo_u32 v21, v10, v6 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v15 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v17 +; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12 +; TONGA-NEXT: v_mul_hi_u32 v13, v1, v13 +; TONGA-NEXT: v_mul_hi_u32 v14, v2, v14 ; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; TONGA-NEXT: v_mul_lo_u32 v15, v12, v4 +; TONGA-NEXT: v_mul_lo_u32 v17, v13, v5 +; TONGA-NEXT: v_mul_lo_u32 v21, v14, v6 ; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v9 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 +; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v13 +; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v1, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v7 -; TONGA-NEXT: v_add_u32_e32 v22, vcc, 1, v10 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v9 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v11 +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] +; TONGA-NEXT: v_mul_lo_u32 v19, v19, v18 +; TONGA-NEXT: v_sub_u32_e32 v20, vcc, v2, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v12 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v13 +; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v14 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v4, v19, v18 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v8, v11, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; TONGA-NEXT: v_mul_hi_u32 v4, v18, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v10 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 +; TONGA-NEXT: v_mul_hi_u32 v4, v18, v19 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc -; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; TONGA-NEXT: v_xor_b32_e32 v6, v8, v14 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v7 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v3 +; TONGA-NEXT: v_max_i32_e32 v5, v3, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 +; TONGA-NEXT: v_mul_hi_u32 v4, v5, v4 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v10 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 +; TONGA-NEXT: v_mul_lo_u32 v6, v4, v11 +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v5, v11 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1078,138 +1055,126 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s6, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_abs_i32 s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9-NEXT: s_ashr_i32 s9, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s7, v4 +; GFX9-NEXT: s_xor_b32 s0, s7, s0 +; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s9, s4 -; GFX9-NEXT: s_xor_b32 s8, s8, s9 -; GFX9-NEXT: s_sub_i32 s9, 0, s6 +; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_abs_i32 s7, s7 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_readfirstlane_b32 s5, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s10 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s9 -; GFX9-NEXT: s_add_i32 s10, s10, s9 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s10 -; GFX9-NEXT: s_mul_i32 s10, s9, s6 -; GFX9-NEXT: s_sub_i32 s8, s8, s10 -; GFX9-NEXT: s_add_i32 s11, s9, 1 -; GFX9-NEXT: s_sub_i32 s10, s8, s6 -; GFX9-NEXT: s_cmp_ge_u32 s8, s6 -; GFX9-NEXT: s_cselect_b32 s9, s11, s9 -; GFX9-NEXT: s_cselect_b32 s8, s10, s8 -; GFX9-NEXT: s_add_i32 s10, s9, 1 -; GFX9-NEXT: s_cmp_ge_u32 s8, s6 -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s7, s7, s8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s0, s9, s0 +; GFX9-NEXT: s_add_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s9 +; GFX9-NEXT: s_mul_i32 s9, s0, s1 +; GFX9-NEXT: s_sub_i32 s7, s7, s9 +; GFX9-NEXT: s_add_i32 s10, s0, 1 +; GFX9-NEXT: s_sub_i32 s9, s7, s1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s0, s10, s0 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s1, s9, s0 +; GFX9-NEXT: s_abs_i32 s7, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: v_readfirstlane_b32 s10, v5 -; GFX9-NEXT: s_ashr_i32 s11, s10, 31 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_xor_b32 s1, s1, s8 +; GFX9-NEXT: s_sub_i32 s10, 0, s7 +; GFX9-NEXT: s_sub_i32 s8, s1, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s10, s10, s11 -; GFX9-NEXT: s_xor_b32 s8, s11, s8 -; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: v_readfirstlane_b32 s9, v5 +; GFX9-NEXT: s_xor_b32 s6, s9, s6 +; GFX9-NEXT: s_abs_i32 s9, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s10, s11 -; GFX9-NEXT: s_sub_i32 s10, 0, s7 -; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s10, s11, s10 -; GFX9-NEXT: s_add_i32 s11, s11, s10 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s11 -; GFX9-NEXT: s_mul_i32 s11, s10, s7 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_add_i32 s12, s10, 1 -; GFX9-NEXT: s_sub_i32 s11, s6, s7 -; GFX9-NEXT: s_cmp_ge_u32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s10, s12, s10 -; GFX9-NEXT: s_cselect_b32 s6, s11, s6 -; GFX9-NEXT: s_add_i32 s11, s10, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s6, s11, s10 -; GFX9-NEXT: s_ashr_i32 s7, s9, 31 -; GFX9-NEXT: s_add_i32 s9, s9, s7 -; GFX9-NEXT: s_xor_b32 s9, s9, s7 +; GFX9-NEXT: s_ashr_i32 s6, s6, 31 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s10, s10, s1 +; GFX9-NEXT: s_mul_hi_u32 s10, s1, s10 +; GFX9-NEXT: s_add_i32 s1, s1, s10 +; GFX9-NEXT: s_mul_hi_u32 s1, s9, s1 +; GFX9-NEXT: s_mul_i32 s10, s1, s7 +; GFX9-NEXT: s_sub_i32 s9, s9, s10 +; GFX9-NEXT: s_add_i32 s11, s1, 1 +; GFX9-NEXT: s_sub_i32 s10, s9, s7 +; GFX9-NEXT: s_cmp_ge_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s1, s11, s1 +; GFX9-NEXT: s_cselect_b32 s9, s10, s9 +; GFX9-NEXT: s_add_i32 s10, s1, 1 +; GFX9-NEXT: s_cmp_ge_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s7, s10, s1 +; GFX9-NEXT: s_abs_i32 s9, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_readfirstlane_b32 s11, v6 -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 +; GFX9-NEXT: s_sub_i32 s11, 0, s9 +; GFX9-NEXT: s_sub_i32 s6, s7, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s11, s11, s12 -; GFX9-NEXT: s_xor_b32 s7, s12, s7 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v6 +; GFX9-NEXT: s_xor_b32 s4, s10, s4 +; GFX9-NEXT: s_abs_i32 s10, s10 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s8, s11, s12 -; GFX9-NEXT: s_sub_i32 s11, 0, s9 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s11, s11, s7 +; GFX9-NEXT: s_mul_hi_u32 s11, s7, s11 +; GFX9-NEXT: s_add_i32 s7, s7, s11 +; GFX9-NEXT: s_mul_hi_u32 s7, s10, s7 +; GFX9-NEXT: s_mul_i32 s11, s7, s9 +; GFX9-NEXT: s_sub_i32 s10, s10, s11 +; GFX9-NEXT: s_add_i32 s12, s7, 1 +; GFX9-NEXT: s_sub_i32 s11, s10, s9 +; GFX9-NEXT: s_cmp_ge_u32 s10, s9 +; GFX9-NEXT: s_cselect_b32 s7, s12, s7 +; GFX9-NEXT: s_cselect_b32 s10, s11, s10 +; GFX9-NEXT: s_add_i32 s11, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s10, s9 +; GFX9-NEXT: s_cselect_b32 s7, s11, s7 +; GFX9-NEXT: s_abs_i32 s9, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: s_sub_i32 s4, s7, s4 ; GFX9-NEXT: v_readfirstlane_b32 s10, v7 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s11, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s12 -; GFX9-NEXT: s_mul_i32 s12, s11, s9 -; GFX9-NEXT: s_sub_i32 s8, s8, s12 -; GFX9-NEXT: s_add_i32 s13, s11, 1 -; GFX9-NEXT: s_sub_i32 s12, s8, s9 -; GFX9-NEXT: s_cmp_ge_u32 s8, s9 -; GFX9-NEXT: s_cselect_b32 s11, s13, s11 -; GFX9-NEXT: s_cselect_b32 s8, s12, s8 -; GFX9-NEXT: s_add_i32 s12, s11, 1 -; GFX9-NEXT: s_cmp_ge_u32 s8, s9 -; GFX9-NEXT: s_cselect_b32 s8, s12, s11 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_ashr_i32 s4, s10, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_xor_b32 s6, s8, s7 -; GFX9-NEXT: s_xor_b32 s8, s4, s9 -; GFX9-NEXT: s_sub_i32 s6, s6, s7 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s7, 0, s5 -; GFX9-NEXT: s_add_i32 s10, s10, s4 -; GFX9-NEXT: s_xor_b32 s4, s10, s4 -; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_i32 s7, s7, s9 -; GFX9-NEXT: s_mul_hi_u32 s7, s9, s7 -; GFX9-NEXT: s_add_i32 s9, s9, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_abs_i32 s6, s10 +; GFX9-NEXT: s_xor_b32 s5, s10, s5 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: s_mul_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s8, s7, s8 +; GFX9-NEXT: s_add_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_mul_i32 s8, s7, s9 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 ; GFX9-NEXT: s_add_i32 s10, s7, 1 -; GFX9-NEXT: s_sub_i32 s9, s4, s5 -; GFX9-NEXT: s_cmp_ge_u32 s4, s5 +; GFX9-NEXT: s_sub_i32 s8, s6, s9 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 ; GFX9-NEXT: s_cselect_b32 s7, s10, s7 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_add_i32 s9, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s9, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cselect_b32 s6, s8, s7 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 +; GFX9-NEXT: s_sub_i32 s5, s6, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2009,20 +1974,19 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 -; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v1 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 25 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 +; GCN-NEXT: v_max_i32_e32 v2, v1, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GCN-NEXT: v_bfe_i32 v5, v0, 0, 25 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 +; GCN-NEXT: v_max_i32_e32 v5, v0, v5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v0 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 @@ -2030,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2057,20 +2021,19 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 -; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1 +; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 25 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, 0, v1 +; TONGA-NEXT: v_max_i32_e32 v2, v1, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 -; TONGA-NEXT: v_bfe_i32 v5, v0, 0, 25 +; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 @@ -2078,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 +; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2102,42 +2065,39 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_mov_b32 s8, s6 ; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0x190000 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x10018 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x190000 +; GFX9-NEXT: s_abs_i32 s7, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v1 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x190000 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10018 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_xor_b32 s5, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s7 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s6, s7 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_add_i32 s9, s6, 1 ; GFX9-NEXT: s_sub_i32 s8, s4, s7 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_add_i32 s8, s6, 1 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s5 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index b086640c72f80..c310e257adadc 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -147,11 +147,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] ; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 ; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 @@ -357,13 +357,13 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v12, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v12, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v1, v12, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v0, v13, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v13, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v13 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 @@ -587,43 +587,39 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s8 -; GCN-NEXT: s_xor_b32 s9, s2, s8 +; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_abs_i32 s0, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_xor_b32 s1, s3, s8 +; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-NEXT: s_add_i32 s3, s3, s2 -; GCN-NEXT: s_xor_b32 s3, s3, s2 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_xor_b32 s0, s2, s8 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_mul_i32 s2, s1, s9 -; GCN-NEXT: s_sub_i32 s2, s3, s2 -; GCN-NEXT: s_add_i32 s8, s1, 1 -; GCN-NEXT: s_sub_i32 s3, s2, s9 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-NEXT: s_add_i32 s3, s1, 1 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_xor_b32 s1, s1, s0 -; GCN-NEXT: s_sub_i32 s0, s1, s0 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s3, s2, s9 +; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: s_add_i32 s8, s2, 1 +; GCN-NEXT: s_sub_i32 s3, s0, s9 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_add_i32 s3, s2, 1 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s1 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -632,43 +628,39 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s8 -; GCN-IR-NEXT: s_xor_b32 s9, s2, s8 +; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_abs_i32 s0, s3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_xor_b32 s1, s3, s8 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_add_i32 s3, s3, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s3, s2 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_xor_b32 s0, s2, s8 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s1, s9 -; GCN-IR-NEXT: s_sub_i32 s2, s3, s2 -; GCN-IR-NEXT: s_add_i32 s8, s1, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s2, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-IR-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-IR-NEXT: s_add_i32 s3, s1, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 +; GCN-IR-NEXT: s_add_i32 s8, s2, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-IR-NEXT: s_add_i32 s3, s2, 1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -688,41 +680,38 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s8 -; GCN-NEXT: s_xor_b32 s9, s2, s8 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 +; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: s_xor_b32 s1, s2, s8 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s3, s8 +; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_mul_i32 s3, s1, s9 -; GCN-NEXT: s_sub_i32 s2, s2, s3 -; GCN-NEXT: s_add_i32 s8, s1, 1 -; GCN-NEXT: s_sub_i32 s3, s2, s9 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-NEXT: s_add_i32 s3, s1, 1 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_xor_b32 s1, s1, s0 -; GCN-NEXT: s_sub_i32 s0, s1, s0 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s3, s2, s9 +; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: s_add_i32 s8, s2, 1 +; GCN-NEXT: s_sub_i32 s3, s0, s9 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_add_i32 s3, s2, 1 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s1 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -735,41 +724,38 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s8 -; GCN-IR-NEXT: s_xor_b32 s9, s2, s8 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 +; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_xor_b32 s2, s2, s3 -; GCN-IR-NEXT: s_xor_b32 s0, s3, s8 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-IR-NEXT: s_mul_i32 s3, s1, s9 -; GCN-IR-NEXT: s_sub_i32 s2, s2, s3 -; GCN-IR-NEXT: s_add_i32 s8, s1, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s2, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-IR-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-IR-NEXT: s_add_i32 s3, s1, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 +; GCN-IR-NEXT: s_add_i32 s8, s2, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-IR-NEXT: s_add_i32 s3, s2, 1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -856,41 +842,38 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s8 -; GCN-NEXT: s_xor_b32 s9, s2, s8 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 +; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_abs_i32 s0, s2 +; GCN-NEXT: s_xor_b32 s1, s2, s8 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s3, s8 +; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_mul_i32 s3, s1, s9 -; GCN-NEXT: s_sub_i32 s2, s2, s3 -; GCN-NEXT: s_add_i32 s8, s1, 1 -; GCN-NEXT: s_sub_i32 s3, s2, s9 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-NEXT: s_add_i32 s3, s1, 1 -; GCN-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_xor_b32 s1, s1, s0 -; GCN-NEXT: s_sub_i32 s0, s1, s0 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s3, s2, s9 +; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: s_add_i32 s8, s2, 1 +; GCN-NEXT: s_sub_i32 s3, s0, s9 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_add_i32 s3, s2, 1 +; GCN-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s1 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -903,41 +886,38 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s8 -; GCN-IR-NEXT: s_xor_b32 s9, s2, s8 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 +; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s0, s2 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_xor_b32 s2, s2, s3 -; GCN-IR-NEXT: s_xor_b32 s0, s3, s8 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-IR-NEXT: s_mul_i32 s3, s1, s9 -; GCN-IR-NEXT: s_sub_i32 s2, s2, s3 -; GCN-IR-NEXT: s_add_i32 s8, s1, 1 -; GCN-IR-NEXT: s_sub_i32 s3, s2, s9 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s8, s1 -; GCN-IR-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-IR-NEXT: s_add_i32 s3, s1, 1 -; GCN-IR-NEXT: s_cmp_ge_u32 s2, s9 -; GCN-IR-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-IR-NEXT: s_mul_i32 s3, s2, s9 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 +; GCN-IR-NEXT: s_add_i32 s8, s2, 1 +; GCN-IR-NEXT: s_sub_i32 s3, s0, s9 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s2, s8, s2 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-IR-NEXT: s_add_i32 s3, s2, 1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s9 +; GCN-IR-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s1 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -1100,11 +1080,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GCN-IR-NEXT: s_mov_b32 s5, s4 ; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 @@ -1310,7 +1290,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s4 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 ; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3] @@ -1493,8 +1473,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v12, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v12, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 @@ -1686,8 +1666,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v12, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v12, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v12 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 @@ -1788,8 +1768,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v10, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v10 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v10 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v10 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v10, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ed7f27b367fda..93fab7dff253b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -335,11 +335,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v14 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v14 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v4 ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc @@ -655,37 +655,34 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s8, s2, s3 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_abs_i32 s3, s2 +; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_xor_b32 s2, s2, s3 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s8 -; GCN-NEXT: s_sub_i32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s3 -; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_mul_i32 s1, s1, s8 +; GCN-NEXT: s_sub_i32 s1, s3, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -699,37 +696,34 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 -; GCN-IR-NEXT: s_xor_b32 s8, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s3, s2 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_xor_b32 s2, s2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s3 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 +; GCN-IR-NEXT: s_sub_i32 s1, s3, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -750,37 +744,34 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s8, s2, s3 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_abs_i32 s3, s2 +; GCN-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: s_xor_b32 s2, s2, s3 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s8 -; GCN-NEXT: s_sub_i32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s3 -; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_mul_i32 s1, s1, s8 +; GCN-NEXT: s_sub_i32 s1, s3, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -794,37 +785,34 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 -; GCN-IR-NEXT: s_xor_b32 s8, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s3, s2 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 31 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: s_xor_b32 s2, s2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s3 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s3 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 +; GCN-IR-NEXT: s_sub_i32 s1, s3, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -845,36 +833,33 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s8, s2, s3 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-NEXT: s_add_i32 s3, s3, s2 -; GCN-NEXT: s_xor_b32 s3, s3, s2 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_abs_i32 s2, s3 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s8 -; GCN-NEXT: s_sub_i32 s0, s3, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_sub_i32 s1, s0, s8 -; GCN-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s0, s0, s2 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_mul_i32 s1, s1, s8 +; GCN-NEXT: s_sub_i32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s8 +; GCN-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -887,36 +872,33 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-IR-NEXT: s_add_i32 s2, s2, s3 -; GCN-IR-NEXT: s_xor_b32 s8, s2, s3 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_add_i32 s3, s3, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s3, s2 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_abs_i32 s2, s3 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s0, s8 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8 -; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s2 -; GCN-IR-NEXT: s_sub_i32 s0, s0, s2 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 +; GCN-IR-NEXT: s_sub_i32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s8 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s8 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_xor_b32 s1, s1, s0 +; GCN-IR-NEXT: s_sub_i32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -1074,19 +1056,19 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 +; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s8, s6, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s7, s10 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[14:15] ; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 @@ -1215,17 +1197,17 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 -; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[4:5], 16 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 16 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 ; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 +; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 33cc8e96f663f..1c303de55c95d 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -177,56 +177,55 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $152, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl $156, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %ebx, %ebp -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: xorl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: subl %edx, %edi +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al @@ -238,76 +237,78 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %ebp, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: addl $32, %ebp ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: addl $64, %ebp +; X86-NEXT: bsrl %edi, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: addl $32, %edi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: addl $64, %edi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl %esi, %edx +; X86-NEXT: cmovnel %ecx, %edi +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: bsrl %ebp, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: subl %edx, %ebp -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl %edx, %edi ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebp, %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %ecx ; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: cmovnel %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: xorl $127, %edi -; X86-NEXT: orl %ebp, %edi -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %edi, %ecx ; X86-NEXT: je .LBB4_8 @@ -316,10 +317,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -333,27 +334,27 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 144(%esp,%edi), %edx -; X86-NEXT: movl 148(%esp,%edi), %esi +; X86-NEXT: movl 148(%esp,%edi), %edx +; X86-NEXT: movl 152(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 140(%esp,%edi), %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shrl %ebx -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl 136(%esp,%edi), %edx +; X86-NEXT: movl 144(%esp,%edi), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: shrl %ebp +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: orl %edx, %ebp +; X86-NEXT: movl 140(%esp,%edi), %edx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx ; X86-NEXT: jae .LBB4_3 @@ -363,38 +364,37 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %dl, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: movb %dl, %cl ; X86-NEXT: shrb $3, %cl ; X86-NEXT: andb $15, %cl ; X86-NEXT: movzbl %cl, %edx -; X86-NEXT: movl 100(%esp,%edx), %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%edx), %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 104(%esp,%edx), %ebx +; X86-NEXT: movl 100(%esp,%edx), %edi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %esi, %ebp -; X86-NEXT: movl 88(%esp,%edx), %ebx +; X86-NEXT: shrdl %cl, %ebx, %ebp ; X86-NEXT: movl 92(%esp,%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%edx), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx ; X86-NEXT: notb %cl @@ -403,9 +403,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: shrl %cl, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -415,7 +415,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -424,26 +423,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebp, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %esi +; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shldl $1, %edx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl $1, %eax, %ecx -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %eax -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -452,15 +451,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax @@ -468,21 +467,22 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sbbl %eax, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -491,15 +491,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: orl %edi, %ecx ; X86-NEXT: jne .LBB4_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: shldl $1, %ebp, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: shldl $1, %eax, %ebx -; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: shldl $1, %eax, %ebp +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: orl %ecx, %eax @@ -508,36 +508,35 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: .LBB4_8: # %udiv-end ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: xorl %ecx, %ebp ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: subl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: sbbl %ecx, %ebp ; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %ebx, 8(%ecx) +; X86-NEXT: movl %ebp, 8(%ecx) ; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edx @@ -545,7 +544,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax @@ -556,35 +555,36 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebp, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: imull %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebp +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: subl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: sbbl %esi, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $152, %esp +; X86-NEXT: addl $156, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll index ace78b38d53ed..fbc363f77ec42 100644 --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -36,9 +36,9 @@ define void @f() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarl $30, %ecx ; X86-NEXT: sarl $31, %eax -; X86-NEXT: shrdl $1, %eax, %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: xorl %eax, %edi +; X86-NEXT: shrdl $1, %eax, %ecx ; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: subl %ecx, %esi ; X86-NEXT: sbbl %eax, %edi