diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b5ceaaa14b4fd..804ffb90b5302 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -595,11 +595,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; uint64_t Imm; - if (ConstantFPSDNode *FP = dyn_cast(N)) + if (ConstantFPSDNode *FP = dyn_cast(N)) { Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - else { + if (AMDGPU::isValid32BitLiteral(Imm, true)) + break; + } else { ConstantSDNode *C = cast(N); Imm = C->getZExtValue(); + if (AMDGPU::isValid32BitLiteral(Imm, false)) + break; } SDLoc DL(N); @@ -3014,7 +3018,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (!RC || SIRI->isSGPRClass(RC)) return false; - if (RC != &AMDGPU::VS_32RegClass) { + if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) { AllUsesAcceptSReg = false; SDNode * User = *U; if (User->isMachineOpcode()) { @@ -3026,7 +3030,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) { unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs(); const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo); - if (CommutedRC == &AMDGPU::VS_32RegClass) + if (CommutedRC == &AMDGPU::VS_32RegClass || + CommutedRC == &AMDGPU::VS_64RegClass) AllUsesAcceptSReg = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 31d72fb8cadd8..2cf60f338105b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2551,11 +2551,13 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineOperand &ImmOp = I.getOperand(1); Register DstReg = I.getOperand(0).getReg(); unsigned Size = MRI->getType(DstReg).getSizeInBits(); + bool IsFP = false; // The AMDGPU backend 
only supports Imm operands and not CImm or FPImm. if (ImmOp.isFPImm()) { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); + IsFP = true; } else if (ImmOp.isCImm()) { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); } else { @@ -2568,6 +2570,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { unsigned Opcode; if (DstRB->getID() == AMDGPU::VCCRegBankID) { Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + } else if (Size == 64 && + AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) { + Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO; + I.setDesc(TII.get(Opcode)); + I.addImplicitDefUseOperands(*MF); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } else { Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index b32ed9fef5dd3..b7ac90e33f65e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -367,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: - SMovOp = AMDGPU::S_MOV_B64; + SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; break; } Imm = ImmOp->getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index fae43bf30a3f6..f1e375ee52cb8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5611,9 +5611,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 || OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; - if (Is64BitOp && !AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) && - !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) 
- return false; + if (Is64BitOp && + !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + return false; + + // FIXME: We can use sign extended 64-bit literals, but only for signed + // operands. At the moment we do not know if an operand is signed. + // Such operand will be encoded as its low 32 bits and then either + // correctly sign extended or incorrectly zero extended by HW. + if (!Is64BitFPOp && (int32_t)Imm < 0) + return false; + } } // Handle non-register types that are treated like immediates. diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 3cd0821b0f86c..60fdedb687357 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1956,6 +1956,29 @@ def : GCNPat < (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; +// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit +// immediate and will be expanded as needed, but we will only use these patterns +// for values which can be encoded. 
+def : GCNPat < + (VGPRImm<(i64 imm)>:$imm), + (V_MOV_B64_PSEUDO imm:$imm) +>; + +def : GCNPat < + (VGPRImm<(f64 fpimm)>:$imm), + (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) +>; + +def : GCNPat < + (i64 imm:$imm), + (S_MOV_B64_IMM_PSEUDO imm:$imm) +>; + +def : GCNPat < + (f64 fpimm:$imm), + (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm))) +>; + def : GCNPat < (f32 fpimm:$imm), (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll index ed525fb83c6de..621394fd290b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll @@ -41,11 +41,12 @@ entry: } ; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower +; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8001 ; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001 ; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc ; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc -; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo +; GFX10: v_{{(dual_)?}}cndmask_b32{{(_e32)?}} [[A:v[0-9]+]], 0x8001, [[A]] ; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 { entry: @@ -56,6 +57,7 @@ entry: } ; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher +; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8000 ; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000 ; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc ; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 701a733d9e8e9..8bf34caea4051 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2090,69 +2090,69 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) { ; GCN-LABEL: 
dyn_extract_v16f64_s_s: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_mov_b32 s66, 0 +; GCN-NEXT: s_mov_b32 s64, 0 +; GCN-NEXT: s_mov_b32 s62, 0 +; GCN-NEXT: s_mov_b32 s60, 0 +; GCN-NEXT: s_mov_b32 s58, 0 +; GCN-NEXT: s_mov_b32 s56, 0 +; GCN-NEXT: s_mov_b32 s54, 0 +; GCN-NEXT: s_mov_b32 s52, 0 +; GCN-NEXT: s_mov_b32 s50, 0 +; GCN-NEXT: s_mov_b32 s48, 0 +; GCN-NEXT: s_mov_b32 s46, 0 +; GCN-NEXT: s_mov_b32 s44, 0 +; GCN-NEXT: s_mov_b32 s40, 0 ; GCN-NEXT: s_mov_b64 s[36:37], 1.0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_mov_b32 s67, 0x40300000 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 -; GCN-NEXT: s_mov_b32 s64, s66 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 -; GCN-NEXT: s_mov_b32 s62, s66 ; GCN-NEXT: s_mov_b32 s61, 0x402a0000 -; GCN-NEXT: s_mov_b32 s60, s66 ; GCN-NEXT: s_mov_b32 s59, 0x40280000 -; GCN-NEXT: s_mov_b32 s58, s66 ; GCN-NEXT: s_mov_b32 s57, 0x40260000 -; GCN-NEXT: s_mov_b32 s56, s66 ; GCN-NEXT: s_mov_b32 s55, 0x40240000 -; GCN-NEXT: s_mov_b32 s54, s66 ; GCN-NEXT: s_mov_b32 s53, 0x40220000 -; GCN-NEXT: s_mov_b32 s52, s66 ; GCN-NEXT: s_mov_b32 s51, 0x40200000 -; GCN-NEXT: s_mov_b32 s50, s66 ; GCN-NEXT: s_mov_b32 s49, 0x401c0000 -; GCN-NEXT: s_mov_b32 s48, s66 ; GCN-NEXT: s_mov_b32 s47, 0x40180000 -; GCN-NEXT: s_mov_b32 s46, s66 ; GCN-NEXT: s_mov_b32 s45, 0x40140000 -; GCN-NEXT: s_mov_b32 s44, s66 ; GCN-NEXT: s_mov_b64 s[42:43], 4.0 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 -; GCN-NEXT: s_mov_b32 s40, s66 ; GCN-NEXT: s_mov_b64 s[38:39], 2.0 ; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s: ; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: s_mov_b32 s66, 0 ; GFX10PLUS-NEXT: s_mov_b64 s[36:37], 1.0 ; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s66, 0 +; GFX10PLUS-NEXT: s_mov_b32 s64, 0 +; GFX10PLUS-NEXT: s_mov_b32 s62, 0 +; GFX10PLUS-NEXT: s_mov_b32 s60, 0 +; GFX10PLUS-NEXT: s_mov_b32 s58, 0 +; GFX10PLUS-NEXT: s_mov_b32 s56, 0 +; GFX10PLUS-NEXT: s_mov_b32 s54, 0 +; GFX10PLUS-NEXT: 
s_mov_b32 s52, 0 +; GFX10PLUS-NEXT: s_mov_b32 s50, 0 +; GFX10PLUS-NEXT: s_mov_b32 s48, 0 +; GFX10PLUS-NEXT: s_mov_b32 s46, 0 +; GFX10PLUS-NEXT: s_mov_b32 s44, 0 +; GFX10PLUS-NEXT: s_mov_b32 s40, 0 ; GFX10PLUS-NEXT: s_mov_b32 s67, 0x40300000 ; GFX10PLUS-NEXT: s_mov_b32 s65, 0x402e0000 -; GFX10PLUS-NEXT: s_mov_b32 s64, s66 ; GFX10PLUS-NEXT: s_mov_b32 s63, 0x402c0000 -; GFX10PLUS-NEXT: s_mov_b32 s62, s66 ; GFX10PLUS-NEXT: s_mov_b32 s61, 0x402a0000 -; GFX10PLUS-NEXT: s_mov_b32 s60, s66 ; GFX10PLUS-NEXT: s_mov_b32 s59, 0x40280000 -; GFX10PLUS-NEXT: s_mov_b32 s58, s66 ; GFX10PLUS-NEXT: s_mov_b32 s57, 0x40260000 -; GFX10PLUS-NEXT: s_mov_b32 s56, s66 ; GFX10PLUS-NEXT: s_mov_b32 s55, 0x40240000 -; GFX10PLUS-NEXT: s_mov_b32 s54, s66 ; GFX10PLUS-NEXT: s_mov_b32 s53, 0x40220000 -; GFX10PLUS-NEXT: s_mov_b32 s52, s66 ; GFX10PLUS-NEXT: s_mov_b32 s51, 0x40200000 -; GFX10PLUS-NEXT: s_mov_b32 s50, s66 ; GFX10PLUS-NEXT: s_mov_b32 s49, 0x401c0000 -; GFX10PLUS-NEXT: s_mov_b32 s48, s66 ; GFX10PLUS-NEXT: s_mov_b32 s47, 0x40180000 -; GFX10PLUS-NEXT: s_mov_b32 s46, s66 ; GFX10PLUS-NEXT: s_mov_b32 s45, 0x40140000 -; GFX10PLUS-NEXT: s_mov_b32 s44, s66 ; GFX10PLUS-NEXT: s_mov_b64 s[42:43], 4.0 ; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40080000 -; GFX10PLUS-NEXT: s_mov_b32 s40, s66 ; GFX10PLUS-NEXT: s_mov_b64 s[38:39], 2.0 ; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -3085,10 +3085,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 +; GPRIDX-NEXT: s_mov_b32 s4, 0 +; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000 -; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 -; GPRIDX-NEXT: s_mov_b32 s4, s2 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 ; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 @@ -3176,10 +3176,10 @@ 
define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 +; MOVREL-NEXT: s_mov_b32 s4, 0 +; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 -; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s4, s2 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 @@ -3207,7 +3207,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3267,22 +3267,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s3, 0x40140000 -; GFX10-NEXT: s_mov_b32 s5, 0x40080000 -; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: s_cmp_eq_u32 s8, 1 -; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 -; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0x40140000 +; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s6, 4 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3299,7 +3299,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: kernel_code_entry_byte_offset = 256 ; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX11-NEXT: granulated_workitem_vgpr_count = 0 -; GFX11-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX11-NEXT: priority = 0 ; GFX11-NEXT: float_mode = 240 ; GFX11-NEXT: priv = 0 @@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: gds_segment_byte_size = 0 ; GFX11-NEXT: kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 9 +; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -3359,22 +3359,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40140000 -; GFX11-NEXT: s_mov_b32 s5, 0x40080000 -; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s8, 1 -; GFX11-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 2 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX11-NEXT: s_cmp_eq_u32 s8, 3 -; GFX11-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] -; GFX11-NEXT: s_cmp_eq_u32 s8, 4 +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s5, 0x40140000 +; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s6, 4 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -4784,11 +4784,8 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) { ; MOVREL-LABEL: v_extract_v64i32_32: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] @@ -4823,11 +4820,8 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) { ; MOVREL-LABEL: v_extract_v64i32_33: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; 
MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 66bff4a14cac8..c6ea046f95a91 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1473,12 +1473,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -1504,12 +1504,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX940-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm @@ -1549,12 +1549,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -1748,12 +1748,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 139bb40daa930..056629ca35451 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -5824,29 +5824,28 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: 
s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX6-NEXT: s_sub_i32 s9, s12, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s12 -; GFX6-NEXT: s_cmp_lt_u32 s12, 64 +; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX6-NEXT: s_sub_i32 s9, s10, 64 +; GFX6-NEXT: s_sub_i32 s11, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX6-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_mov_b32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX6-NEXT: s_lshl_b32 s5, s6, 31 -; GFX6-NEXT: s_mov_b32 s4, s11 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_lshl_b32 s13, s6, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX6-NEXT: s_sub_i32 s12, s8, 64 ; GFX6-NEXT: s_sub_i32 s10, 64, s8 @@ -5871,29 +5870,28 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[10:11], 
0x7f -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX8-NEXT: s_sub_i32 s9, s12, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s12 -; GFX8-NEXT: s_cmp_lt_u32 s12, 64 +; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX8-NEXT: s_sub_i32 s9, s10, 64 +; GFX8-NEXT: s_sub_i32 s11, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s12, 0 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX8-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_mov_b32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX8-NEXT: s_lshl_b32 s5, s6, 31 -; GFX8-NEXT: s_mov_b32 s4, s11 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshl_b32 s13, s6, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX8-NEXT: s_sub_i32 s12, s8, 64 ; GFX8-NEXT: s_sub_i32 s10, 64, s8 @@ -5918,29 +5916,28 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; 
GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX9-NEXT: s_sub_i32 s9, s12, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s12 -; GFX9-NEXT: s_cmp_lt_u32 s12, 64 +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX9-NEXT: s_sub_i32 s9, s10, 64 +; GFX9-NEXT: s_sub_i32 s11, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX9-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX9-NEXT: s_lshl_b32 s5, s6, 31 -; GFX9-NEXT: s_mov_b32 s4, s11 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_lshl_b32 s13, s6, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX9-NEXT: s_sub_i32 s12, s8, 64 ; GFX9-NEXT: s_sub_i32 s10, 64, s8 @@ -5965,40 +5962,39 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; 
GFX10-NEXT: s_sub_i32 s9, s12, 64 -; GFX10-NEXT: s_sub_i32 s10, 64, s12 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 +; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX10-NEXT: s_sub_i32 s9, s10, 64 +; GFX10-NEXT: s_sub_i32 s11, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_mov_b32 s12, 0 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 ; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX10-NEXT: s_lshl_b32 s5, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s11 -; GFX10-NEXT: s_sub_i32 s14, s8, 64 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_lshl_b32 s13, s6, 31 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_sub_i32 s14, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; 
GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] @@ -6006,47 +6002,45 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b64 s[10:11], 0x7f -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX11-NEXT: s_and_not1_b64 s[8:9], s[10:11], s[8:9] -; GFX11-NEXT: s_sub_i32 s9, s12, 64 -; GFX11-NEXT: s_sub_i32 s10, 64, s12 -; GFX11-NEXT: s_cmp_lt_u32 s12, 64 +; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] +; GFX11-NEXT: s_sub_i32 s9, s10, 64 +; GFX11-NEXT: s_sub_i32 s11, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 ; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s18, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 ; 
GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX11-NEXT: s_lshl_b32 s5, s6, 31 -; GFX11-NEXT: s_mov_b32 s4, s11 -; GFX11-NEXT: s_sub_i32 s14, s8, 64 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_lshl_b32 s13, s6, 31 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX11-NEXT: s_sub_i32 s14, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s15, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] @@ -6054,7 +6048,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6575,23 +6569,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; 
GFX6-NEXT: s_sub_i32 s5, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX6-NEXT: s_sub_i32 s5, s6, 64 +; GFX6-NEXT: s_sub_i32 s7, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] @@ -6605,14 +6598,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s8 +; GFX6-NEXT: s_and_b32 s0, 1, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6620,31 +6613,30 @@ define amdgpu_ps <4 x float> 
@v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX6-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX8-NEXT: s_sub_i32 s5, s6, 64 +; GFX8-NEXT: s_sub_i32 s7, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] @@ -6658,14 +6650,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: 
v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s8 +; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6673,32 +6665,31 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX9-NEXT: s_sub_i32 s5, s6, 64 +; GFX9-NEXT: s_sub_i32 s7, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 +; GFX9-NEXT: s_lshr_b64 s[10:11], 
s[0:1], s7 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 @@ -6710,14 +6701,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s8 +; GFX9-NEXT: s_and_b32 s0, 1, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6725,36 +6716,35 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; 
GFX10-NEXT: s_sub_i32 s5, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s8, 64 -; GFX10-NEXT: s_sub_i32 s6, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s0, 64, s4 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] @@ -6779,34 +6769,33 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX10-NEXT: ; return 
to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] +; GFX11-NEXT: s_sub_i32 s5, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s8, 64 -; GFX11-NEXT: s_sub_i32 s6, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_sub_i32 s0, 64, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6833,9 +6822,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX11-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6845,23 +6834,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s8, 64 -; GFX6-NEXT: s_sub_i32 s6, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX6-NEXT: s_sub_i32 s5, s6, 64 +; GFX6-NEXT: s_sub_i32 s7, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s8 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 ; GFX6-NEXT: s_and_b32 s5, 1, s9 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: s_lshl_b32 s9, s2, 31 -; GFX6-NEXT: s_mov_b32 s8, s7 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] @@ -6900,23 +6888,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshl_i128_vss: ; 
GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s8, 64 -; GFX8-NEXT: s_sub_i32 s6, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX8-NEXT: s_sub_i32 s5, s6, 64 +; GFX8-NEXT: s_sub_i32 s7, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_mov_b32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX8-NEXT: s_and_b32 s5, 1, s9 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: s_lshl_b32 s9, s2, 31 -; GFX8-NEXT: s_mov_b32 s8, s7 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] @@ -6955,23 +6942,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s8, 64 -; GFX9-NEXT: s_sub_i32 s6, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX9-NEXT: s_sub_i32 s5, s6, 64 +; GFX9-NEXT: s_sub_i32 s7, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 
s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX9-NEXT: s_and_b32 s5, 1, s9 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: s_lshl_b32 s9, s2, 31 -; GFX9-NEXT: s_mov_b32 s8, s7 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] @@ -7010,39 +6996,38 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s8, 64 -; GFX10-NEXT: s_sub_i32 s6, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX10-NEXT: s_sub_i32 s5, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] +; GFX10-NEXT: s_and_b32 s6, 1, s8 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_lshl_b32 s9, s2, 31 -; GFX10-NEXT: s_mov_b32 s8, s7 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_lshl_b32 s7, s2, 31 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, 
s10 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_sub_i32 s10, s4, 64 ; GFX10-NEXT: s_sub_i32 s8, 64, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 @@ -7065,40 +7050,38 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX11-LABEL: v_fshl_i128_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s8, 64 -; GFX11-NEXT: s_sub_i32 s6, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] +; GFX11-NEXT: s_sub_i32 s5, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] -; 
GFX11-NEXT: s_cselect_b32 s10, 1, 0 -; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] +; GFX11-NEXT: s_and_b32 s6, 1, s8 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_lshl_b32 s9, s2, 31 -; GFX11-NEXT: s_mov_b32 s8, s7 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s7, s2, 31 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: s_and_b32 s5, 1, s10 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX11-NEXT: s_sub_i32 s10, s4, 64 ; GFX11-NEXT: s_sub_i32 s8, 64, s4 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] @@ -7243,71 +7226,69 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f -; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX6-NEXT: s_sub_i32 s17, s22, 64 -; GFX6-NEXT: s_sub_i32 s23, 64, s22 -; GFX6-NEXT: s_cmp_lt_u32 s22, 64 +; 
GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] +; GFX6-NEXT: s_sub_i32 s17, s18, 64 +; GFX6-NEXT: s_sub_i32 s19, 64, s18 +; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s22, 0 -; GFX6-NEXT: s_cselect_b32 s29, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX6-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 +; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_mov_b32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX6-NEXT: s_lshl_b32 s9, s10, 31 -; GFX6-NEXT: s_mov_b32 s8, s19 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_lshl_b32 s23, s10, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s26, s16, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s16 +; GFX6-NEXT: s_sub_i32 s23, s16, 64 +; GFX6-NEXT: s_sub_i32 s18, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s27, 1, 0 +; GFX6-NEXT: s_cselect_b32 s26, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_cselect_b32 s27, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 ; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX6-NEXT: 
s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX6-NEXT: s_sub_i32 s11, s8, 64 ; GFX6-NEXT: s_sub_i32 s9, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s18, 1, 0 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] +; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s22, 0 +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s9, s14, 31 -; GFX6-NEXT: s_mov_b32 s8, s19 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_lshl_b32 s23, s14, 31 +; GFX6-NEXT: 
s_or_b64 s[4:5], s[4:5], s[22:23] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX6-NEXT: s_sub_i32 s18, s10, 64 ; GFX6-NEXT: s_sub_i32 s14, 64, s10 @@ -7332,71 +7313,69 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f -; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX8-NEXT: s_sub_i32 s17, s22, 64 -; GFX8-NEXT: s_sub_i32 s23, 64, s22 -; GFX8-NEXT: s_cmp_lt_u32 s22, 64 +; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] +; GFX8-NEXT: s_sub_i32 s17, s18, 64 +; GFX8-NEXT: s_sub_i32 s19, 64, s18 +; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s18, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s22, 0 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX8-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 +; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_mov_b32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX8-NEXT: s_lshl_b32 s9, s10, 31 -; GFX8-NEXT: s_mov_b32 s8, s19 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_lshl_b32 s23, s10, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] ; 
GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s26, s16, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s16 +; GFX8-NEXT: s_sub_i32 s23, s16, 64 +; GFX8-NEXT: s_sub_i32 s18, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s16, 0 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 ; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX8-NEXT: s_sub_i32 s11, s8, 64 ; GFX8-NEXT: s_sub_i32 s9, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] +; GFX8-NEXT: s_or_b64 s[8:9], 
s[18:19], s[8:9] ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s22, 0 +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s9, s14, 31 -; GFX8-NEXT: s_mov_b32 s8, s19 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_lshl_b32 s23, s14, 31 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX8-NEXT: s_sub_i32 s18, s10, 64 ; GFX8-NEXT: s_sub_i32 s14, 64, s10 @@ -7421,71 +7400,69 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f -; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX9-NEXT: s_sub_i32 s17, s22, 64 -; GFX9-NEXT: s_sub_i32 s23, 64, s22 -; GFX9-NEXT: s_cmp_lt_u32 s22, 64 +; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] +; GFX9-NEXT: s_sub_i32 s17, s18, 64 +; GFX9-NEXT: s_sub_i32 s19, 64, s18 +; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s18, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s22, 0 -; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX9-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 +; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cmp_lg_u32 
s23, 0 ; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_mov_b32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX9-NEXT: s_lshl_b32 s9, s10, 31 -; GFX9-NEXT: s_mov_b32 s8, s19 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_lshl_b32 s23, s10, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s26, s16, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s16 +; GFX9-NEXT: s_sub_i32 s23, s16, 64 +; GFX9-NEXT: s_sub_i32 s18, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s27, 1, 0 +; GFX9-NEXT: s_cselect_b32 s26, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_cselect_b32 s27, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 ; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_or_b64 s[0:1], 
s[24:25], s[0:1] ; GFX9-NEXT: s_sub_i32 s11, s8, 64 ; GFX9-NEXT: s_sub_i32 s9, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s18, 1, 0 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] +; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s9, s14, 31 -; GFX9-NEXT: s_mov_b32 s8, s19 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_lshl_b32 s23, s14, 31 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX9-NEXT: s_sub_i32 s18, s10, 64 ; GFX9-NEXT: s_sub_i32 s14, 64, s10 @@ -7510,73 +7487,71 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f -; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX10-NEXT: s_sub_i32 s17, s22, 64 -; GFX10-NEXT: s_sub_i32 s23, 64, s22 -; GFX10-NEXT: s_cmp_lt_u32 s22, 64 +; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] +; GFX10-NEXT: s_sub_i32 s17, s18, 64 +; GFX10-NEXT: s_sub_i32 s19, 64, s18 +; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_mov_b32 s22, 0 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 
s18, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX10-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX10-NEXT: s_lshl_b32 s9, s10, 31 -; GFX10-NEXT: s_mov_b32 s8, s19 -; GFX10-NEXT: s_sub_i32 s26, s16, 64 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: s_lshl_b32 s23, s10, 31 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX10-NEXT: s_sub_i32 s23, s16, 64 ; GFX10-NEXT: s_sub_i32 s17, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10-NEXT: s_cselect_b32 s26, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 ; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: 
s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] ; GFX10-NEXT: s_sub_i32 s11, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX10-NEXT: s_lshl_b64 s[20:21], s[6:7], s8 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s13, s14, 31 -; GFX10-NEXT: s_mov_b32 s12, s19 -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_lshl_b32 s23, s14, 31 ; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] +; GFX10-NEXT: s_sub_i32 s18, s10, 64 ; GFX10-NEXT: s_sub_i32 s11, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7599,74 +7574,71 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg 
%lhs, <2 x i128> inr ; ; GFX11-LABEL: s_fshl_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b64 s[18:19], 0x7f -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX11-NEXT: s_and_not1_b64 s[16:17], s[18:19], s[16:17] -; GFX11-NEXT: s_sub_i32 s17, s22, 64 -; GFX11-NEXT: s_sub_i32 s23, 64, s22 -; GFX11-NEXT: s_cmp_lt_u32 s22, 64 +; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] +; GFX11-NEXT: s_sub_i32 s17, s18, 64 +; GFX11-NEXT: s_sub_i32 s19, 64, s18 +; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_mov_b32 s22, 0 +; GFX11-NEXT: s_cselect_b32 s23, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s18, 0 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s29, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX11-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 ; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX11-NEXT: s_lshl_b32 s9, s10, 31 -; GFX11-NEXT: s_mov_b32 s8, s19 -; GFX11-NEXT: s_sub_i32 s26, s16, 64 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_lshl_b32 s23, s10, 31 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX11-NEXT: s_sub_i32 s23, s16, 64 ; GFX11-NEXT: s_sub_i32 s17, 64, s16 ; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; 
GFX11-NEXT: s_cselect_b32 s27, 1, 0 +; GFX11-NEXT: s_cselect_b32 s26, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s16, 0 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 ; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 ; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] +; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] ; GFX11-NEXT: s_sub_i32 s11, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s18, 1, 0 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX11-NEXT: s_lshl_b64 s[20:21], s[6:7], s8 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], 
s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s13, s14, 31 -; GFX11-NEXT: s_mov_b32 s12, s19 -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_lshl_b32 s23, s14, 31 ; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] +; GFX11-NEXT: s_sub_i32 s18, s10, 64 ; GFX11-NEXT: s_sub_i32 s11, 64, s10 ; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 25d845f2f9922..88fa7a8406475 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -5878,39 +5878,38 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshr_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s10, 0x7f -; GFX6-NEXT: s_mov_b32 s11, 0 -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s10, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX6-NEXT: s_sub_i32 s13, s8, 64 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s11, s8, 64 ; GFX6-NEXT: s_sub_i32 s9, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX6-NEXT: 
s_lshr_b64 s[10:11], s[14:15], s9 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 +; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_sub_i32 s14, s12, 64 -; GFX6-NEXT: s_sub_i32 s13, 64, s12 -; GFX6-NEXT: s_cmp_lt_u32 s12, 64 +; GFX6-NEXT: s_sub_i32 s14, s10, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s13 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 @@ -5925,39 +5924,38 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX8-LABEL: s_fshr_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s10, 0x7f -; GFX8-NEXT: s_mov_b32 s11, 0 -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s10, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 -; GFX8-NEXT: 
s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX8-NEXT: s_sub_i32 s13, s8, 64 +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s11, s8, 64 ; GFX8-NEXT: s_sub_i32 s9, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 +; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX8-NEXT: s_sub_i32 s14, s12, 64 -; GFX8-NEXT: s_sub_i32 s13, 64, s12 -; GFX8-NEXT: s_cmp_lt_u32 s12, 64 +; GFX8-NEXT: s_sub_i32 s14, s10, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s12, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s13 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 @@ -5972,39 +5970,38 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX9-LABEL: s_fshr_i128: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_movk_i32 s10, 0x7f -; GFX9-NEXT: s_mov_b32 s11, 0 -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s10, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX9-NEXT: s_sub_i32 s13, s8, 64 +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s11, s8, 64 ; GFX9-NEXT: s_sub_i32 s9, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 +; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX9-NEXT: s_sub_i32 s14, s12, 64 -; GFX9-NEXT: s_sub_i32 s13, 64, s12 -; GFX9-NEXT: s_cmp_lt_u32 s12, 64 +; GFX9-NEXT: s_sub_i32 s14, s10, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX9-NEXT: 
s_lshl_b64 s[12:13], s[6:7], s13 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 @@ -6019,94 +6016,92 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s10, 0x7f -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX10-NEXT: s_lshr_b32 s10, s1, 31 +; GFX10-NEXT: s_lshr_b32 s12, s1, 31 +; GFX10-NEXT: s_mov_b32 s13, 0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_sub_i32 s13, s8, 64 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX10-NEXT: s_sub_i32 s11, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 ; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s13 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s14, s12, 64 -; GFX10-NEXT: s_sub_i32 s10, 64, s12 -; 
GFX10-NEXT: s_cmp_lt_u32 s12, 64 +; GFX10-NEXT: s_sub_i32 s14, s10, 64 +; GFX10-NEXT: s_sub_i32 s11, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 ; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_movk_i32 s10, 0x7f -; GFX11-NEXT: s_mov_b32 s11, 0 +; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] -; GFX11-NEXT: s_and_not1_b64 s[8:9], s[10:11], s[8:9] -; GFX11-NEXT: s_lshr_b32 s10, s1, 31 +; GFX11-NEXT: s_lshr_b32 s12, s1, 31 +; GFX11-NEXT: s_mov_b32 s13, 0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] -; GFX11-NEXT: s_sub_i32 s13, s8, 64 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX11-NEXT: s_sub_i32 s11, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: 
s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 ; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s13 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s14, s12, 64 -; GFX11-NEXT: s_sub_i32 s10, 64, s12 -; GFX11-NEXT: s_cmp_lt_u32 s12, 64 +; GFX11-NEXT: s_sub_i32 s14, s10, 64 +; GFX11-NEXT: s_sub_i32 s11, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 ; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog @@ -6626,45 +6621,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg 
%lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: s_mov_b32 s7, 0 -; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s6, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_i32 s9, s4, 64 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s7, s4, 64 ; GFX6-NEXT: s_sub_i32 s5, 64, s4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 +; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_sub_i32 s0, s8, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: s_sub_i32 s0, s6, 64 +; GFX6-NEXT: s_sub_i32 s1, 64, s6 +; 
GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s8 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s6 +; GFX6-NEXT: s_and_b32 s0, 1, s7 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s7 +; GFX6-NEXT: s_and_b32 s0, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6680,45 +6674,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; ; GFX8-LABEL: v_fshr_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: s_mov_b32 s7, 0 -; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s6, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX8-NEXT: s_sub_i32 s9, s4, 64 +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s7, s4, 64 ; GFX8-NEXT: s_sub_i32 s5, 64, s4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; 
GFX8-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 +; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX8-NEXT: s_sub_i32 s0, s8, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: s_sub_i32 s0, s6, 64 +; GFX8-NEXT: s_sub_i32 s1, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s6 +; GFX8-NEXT: s_and_b32 s0, 1, s7 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s7 +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6734,45 +6727,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; ; GFX9-LABEL: v_fshr_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; 
GFX9-NEXT: s_lshr_b32 s6, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX9-NEXT: s_sub_i32 s9, s4, 64 +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s7, s4, 64 ; GFX9-NEXT: s_sub_i32 s5, 64, s4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 +; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX9-NEXT: s_sub_i32 s0, s8, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: s_sub_i32 s0, s6, 64 +; GFX9-NEXT: s_sub_i32 s1, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s6 +; GFX9-NEXT: s_and_b32 s0, 1, s7 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, 
v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s7 +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6788,45 +6780,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s6, 0x7f -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_lshr_b32 s6, s1, 31 +; GFX10-NEXT: s_lshr_b32 s8, s1, 31 +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GFX10-NEXT: s_sub_i32 s9, s4, 64 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_sub_i32 s7, s4, 64 ; GFX10-NEXT: s_sub_i32 s5, 64, s4 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 ; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s8 +; GFX10-NEXT: s_sub_i32 
s0, 64, s6 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s8, 64 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_sub_i32 s0, s6, 64 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s6 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] +; GFX10-NEXT: s_and_b32 s0, 1, s7 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6842,46 +6833,45 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; ; GFX11-LABEL: v_fshr_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_movk_i32 s6, 0x7f -; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_lshr_b32 s6, s1, 31 +; GFX11-NEXT: s_lshr_b32 s8, s1, 31 +; GFX11-NEXT: s_mov_b32 s9, 0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GFX11-NEXT: s_sub_i32 s9, s4, 64 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX11-NEXT: s_sub_i32 s7, s4, 64 ; GFX11-NEXT: s_sub_i32 s5, 64, s4 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; 
GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s8 +; GFX11-NEXT: s_sub_i32 s0, 64, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s8, 64 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_sub_i32 s0, s6, 64 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_and_b32 s0, 1, s1 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s6 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] +; GFX11-NEXT: s_and_b32 s0, 1, s7 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -6904,42 +6894,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX6-NEXT: 
s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: s_sub_i32 s5, s4, 64 -; GFX6-NEXT: s_sub_i32 s6, 64, s4 +; GFX6-NEXT: s_sub_i32 s7, 64, s4 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s6 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4 -; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: s_and_b32 s4, 1, s8 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: s_and_b32 s4, 1, s9 -; GFX6-NEXT: s_sub_i32 s10, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_sub_i32 s10, s6, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 @@ -6958,42 +6947,41 @@ define 
amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshr_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: s_sub_i32 s5, s4, 64 -; GFX8-NEXT: s_sub_i32 s6, 64, s4 +; GFX8-NEXT: s_sub_i32 s7, 64, s4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s6, v[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX8-NEXT: s_and_b32 s4, 1, s7 +; GFX8-NEXT: s_and_b32 s4, 1, s8 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_and_b32 s4, 1, s9 -; GFX8-NEXT: s_sub_i32 s10, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_sub_i32 s10, s6, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s6 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 @@ -7012,42 +7000,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshr_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: s_sub_i32 s5, s4, 64 -; GFX9-NEXT: s_sub_i32 s6, 64, s4 +; GFX9-NEXT: s_sub_i32 s7, 64, s4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s6, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_and_b32 s4, 1, s8 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_and_b32 s4, 1, s9 -; GFX9-NEXT: s_sub_i32 s10, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_sub_i32 s10, s6, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s6 ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 @@ -7068,41 +7055,40 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1 -; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] +; GFX10-NEXT: s_sub_i32 s7, 64, s4 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX10-NEXT: s_sub_i32 s6, 64, s4 ; GFX10-NEXT: s_sub_i32 s5, s4, 64 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s4, 1, s7 +; GFX10-NEXT: s_and_b32 s4, 1, s8 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s8, 64 -; GFX10-NEXT: s_sub_i32 s6, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; 
GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo @@ -7112,7 +7098,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7122,39 +7108,39 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 -; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s7, 64, s4 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-NEXT: s_sub_i32 s6, 64, s4 ; GFX11-NEXT: s_sub_i32 s5, s4, 64 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; 
GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_and_b32 s4, 1, s7 +; GFX11-NEXT: s_and_b32 s4, 1, s8 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s8, 64 -; GFX11-NEXT: s_sub_i32 s6, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_sub_i32 s10, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 @@ -7163,7 +7149,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 
0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7301,56 +7287,54 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s18, 0x7f -; GFX6-NEXT: s_mov_b32 s19, 0 -; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s0, s1, 31 -; GFX6-NEXT: s_mov_b32 s1, s19 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s23, s16, 64 +; GFX6-NEXT: s_lshr_b32 s24, s1, 31 +; GFX6-NEXT: s_mov_b32 s25, 0 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] +; GFX6-NEXT: s_sub_i32 s19, s16, 64 ; GFX6-NEXT: s_sub_i32 s17, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_cselect_b32 s24, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s29, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 ; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 +; GFX6-NEXT: s_cmp_lg_u32 s24, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 
; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX6-NEXT: s_sub_i32 s26, s22, 64 -; GFX6-NEXT: s_sub_i32 s24, 64, s22 -; GFX6-NEXT: s_cmp_lt_u32 s22, 64 +; GFX6-NEXT: s_sub_i32 s24, s18, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s18 +; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b32 s26, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s22, 0 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX6-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX6-NEXT: s_cmp_lg_u32 s26, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s18, s5, 31 +; GFX6-NEXT: s_lshr_b32 s24, s5, 31 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] ; GFX6-NEXT: s_sub_i32 s9, s10, 64 ; GFX6-NEXT: s_sub_i32 s11, 64, s10 ; GFX6-NEXT: s_cmp_lt_u32 s10, 64 @@ -7390,56 
+7374,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s18, 0x7f -; GFX8-NEXT: s_mov_b32 s19, 0 -; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s0, s1, 31 -; GFX8-NEXT: s_mov_b32 s1, s19 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s23, s16, 64 +; GFX8-NEXT: s_lshr_b32 s24, s1, 31 +; GFX8-NEXT: s_mov_b32 s25, 0 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] +; GFX8-NEXT: s_sub_i32 s19, s16, 64 ; GFX8-NEXT: s_sub_i32 s17, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s16, 0 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 ; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s26, s22, 64 -; GFX8-NEXT: s_sub_i32 s24, 64, s22 -; GFX8-NEXT: s_cmp_lt_u32 s22, 64 +; GFX8-NEXT: s_sub_i32 s24, s18, 64 +; 
GFX8-NEXT: s_sub_i32 s22, 64, s18 +; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s18, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s22, 0 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX8-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s18, s5, 31 +; GFX8-NEXT: s_lshr_b32 s24, s5, 31 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] ; GFX8-NEXT: s_sub_i32 s9, s10, 64 ; GFX8-NEXT: s_sub_i32 s11, 64, s10 ; GFX8-NEXT: s_cmp_lt_u32 s10, 64 @@ -7479,56 +7461,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s18, 0x7f -; GFX9-NEXT: s_mov_b32 s19, 0 
-; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_mov_b32 s1, s19 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s23, s16, 64 +; GFX9-NEXT: s_lshr_b32 s24, s1, 31 +; GFX9-NEXT: s_mov_b32 s25, 0 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] +; GFX9-NEXT: s_sub_i32 s19, s16, 64 ; GFX9-NEXT: s_sub_i32 s17, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 ; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s26, s22, 64 -; GFX9-NEXT: s_sub_i32 s24, 64, s22 -; GFX9-NEXT: s_cmp_lt_u32 s22, 64 +; GFX9-NEXT: s_sub_i32 s24, s18, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s18 +; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b32 s26, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s18, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s22, 0 -; 
GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX9-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f +; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: s_lshr_b32 s18, s5, 31 +; GFX9-NEXT: s_lshr_b32 s24, s5, 31 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] ; GFX9-NEXT: s_sub_i32 s9, s10, 64 ; GFX9-NEXT: s_sub_i32 s11, 64, s10 ; GFX9-NEXT: s_cmp_lt_u32 s10, 64 @@ -7568,56 +7548,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s18, 0x7f -; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; 
GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX10-NEXT: s_lshr_b32 s24, s1, 31 -; GFX10-NEXT: s_mov_b32 s25, s19 +; GFX10-NEXT: s_lshr_b32 s22, s1, 31 +; GFX10-NEXT: s_mov_b32 s23, 0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] +; GFX10-NEXT: s_sub_i32 s19, s16, 64 ; GFX10-NEXT: s_sub_i32 s17, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 -; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s23 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s26, s22, 64 -; GFX10-NEXT: s_sub_i32 s23, 64, s22 -; GFX10-NEXT: s_cmp_lt_u32 s22, 64 +; GFX10-NEXT: s_sub_i32 s22, s18, 64 +; GFX10-NEXT: s_sub_i32 s19, 64, s18 +; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s18, 0 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 -; GFX10-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 +; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], 
s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 +; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_lshr_b32 s22, s5, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f ; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] ; GFX10-NEXT: s_sub_i32 s9, s10, 64 ; GFX10-NEXT: s_sub_i32 s11, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 @@ -7657,56 +7635,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_movk_i32 s18, 0x7f -; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f +; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] -; GFX11-NEXT: s_and_not1_b64 s[16:17], s[18:19], s[16:17] -; GFX11-NEXT: s_lshr_b32 s24, s1, 31 -; GFX11-NEXT: s_mov_b32 s25, s19 +; GFX11-NEXT: s_lshr_b32 s22, s1, 31 +; GFX11-NEXT: s_mov_b32 s23, 0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] -; GFX11-NEXT: s_sub_i32 
s23, s16, 64 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] +; GFX11-NEXT: s_sub_i32 s19, s16, 64 ; GFX11-NEXT: s_sub_i32 s17, 64, s16 ; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s16, 0 -; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 ; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 ; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 ; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s23 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s26, s22, 64 -; GFX11-NEXT: s_sub_i32 s23, 64, s22 -; GFX11-NEXT: s_cmp_lt_u32 s22, 64 +; GFX11-NEXT: s_sub_i32 s22, s18, 64 +; GFX11-NEXT: s_sub_i32 s19, 64, s18 +; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_cselect_b32 s26, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s18, 0 ; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 -; GFX11-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 +; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], 
s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 +; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_lshr_b32 s22, s5, 31 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f ; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] ; GFX11-NEXT: s_sub_i32 s9, s10, 64 ; GFX11-NEXT: s_sub_i32 s11, 64, s10 ; GFX11-NEXT: s_cmp_lt_u32 s10, 64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index d6957be8ab8ff..64c3cd4e8c067 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -146,6 +146,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 @@ -515,6 +516,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -608,6 +610,7 @@ define amdgpu_ps void 
@insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 @@ -701,6 +704,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -866,6 +870,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc @@ -1406,6 +1411,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc @@ -1529,6 +1535,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc @@ -1653,6 +1660,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc @@ -1976,6 +1984,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_not_b32 s6, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc @@ -2693,6 +2702,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc @@ -2846,6 +2856,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_not_b32 s5, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -3001,6 +3012,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -3351,6 +3363,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v0, v2 @@ -4289,11 +4302,11 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr 
addrspace(1) %ptr, i16 inr ; ; GFX7-LABEL: insertelement_v_v16i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, 0xf000 -; GFX7-NEXT: s_mov_b64 s[16:17], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_mov_b64 s[12:13], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:16 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xffff @@ -4309,7 +4322,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] @@ -4325,13 +4338,14 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX7-NEXT: s_mov_b64 s[12:13], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 -; GFX7-NEXT: s_mov_b64 s[16:17], 16 -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: 
insertelement_v_v16i16_s_v: @@ -4523,6 +4537,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v1, v3 @@ -4686,11 +4701,11 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; ; GFX7-LABEL: insertelement_v_v16i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, 0xf000 -; GFX7-NEXT: s_mov_b64 s[16:17], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_mov_b64 s[12:13], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[12:15], 0 addr64 offset:16 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 @@ -4706,7 +4721,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -4722,13 +4737,14 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX7-NEXT: s_mov_b64 s[12:13], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX7-NEXT: 
v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 -; GFX7-NEXT: s_mov_b64 s[16:17], 16 -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v16i16_v_v: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index d531462ae9cc9..16b702edff2db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -159,6 +159,7 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 @@ -581,6 +582,7 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -681,6 +683,7 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -781,6 +784,7 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: 
s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -943,6 +947,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 @@ -1312,6 +1317,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -1405,6 +1411,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 @@ -1498,6 +1505,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 @@ -1741,6 +1749,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc @@ 
-2281,6 +2290,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc @@ -2404,6 +2414,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc @@ -2528,6 +2539,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc @@ -2851,6 +2863,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_not_b32 s6, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc @@ -3568,6 +3581,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc @@ -3721,6 +3735,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_not_b32 s5, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; 
GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -3876,6 +3891,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 61439021a8875..dc9cbb498dab4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -687,17 +687,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 +; GPRIDX-NEXT: s_mov_b32 s16, 0 +; GPRIDX-NEXT: s_mov_b32 s14, 0 +; GPRIDX-NEXT: s_mov_b32 s12, 0 +; GPRIDX-NEXT: s_mov_b32 s8, 0 ; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 -; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 -; GPRIDX-NEXT: s_mov_b32 s14, s18 ; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 -; GPRIDX-NEXT: s_mov_b32 s12, s18 ; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 -; GPRIDX-NEXT: s_mov_b32 s8, s18 ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 @@ -753,17 +753,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_mov_b32 s16, 0 +; GFX10-NEXT: s_mov_b32 s14, 0 +; GFX10-NEXT: 
s_mov_b32 s12, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b64 s[4:5], 1.0 ; GFX10-NEXT: s_mov_b32 s19, 0x40200000 ; GFX10-NEXT: s_mov_b32 s17, 0x401c0000 -; GFX10-NEXT: s_mov_b32 s16, s18 ; GFX10-NEXT: s_mov_b32 s15, 0x40180000 -; GFX10-NEXT: s_mov_b32 s14, s18 ; GFX10-NEXT: s_mov_b32 s13, 0x40140000 -; GFX10-NEXT: s_mov_b32 s12, s18 ; GFX10-NEXT: s_mov_b64 s[10:11], 4.0 ; GFX10-NEXT: s_mov_b32 s9, 0x40080000 -; GFX10-NEXT: s_mov_b32 s8, s18 ; GFX10-NEXT: s_mov_b64 s[6:7], 2.0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: v_mov_b32_e32 v4, s5 @@ -820,16 +820,16 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s14, 0 ; GFX11-NEXT: s_mov_b32 s15, 0x40200000 +; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 ; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 -; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s11, 0x40180000 -; GFX11-NEXT: s_mov_b32 s10, s14 ; GFX11-NEXT: s_mov_b32 s9, 0x40140000 -; GFX11-NEXT: s_mov_b32 s8, s14 ; GFX11-NEXT: s_mov_b64 s[6:7], 4.0 ; GFX11-NEXT: s_mov_b32 s5, 0x40080000 -; GFX11-NEXT: s_mov_b32 s4, s14 ; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir index 2714982163fec..f1c3673ae29dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir @@ -23,6 +23,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX7-NEXT: 
[[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -32,6 +33,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -41,6 +43,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -75,18 +78,17 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX7-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -96,6 +98,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 ; 
GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -103,18 +106,17 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = 
FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -153,6 +155,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} @@ -162,6 +165,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -171,6 +175,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -205,18 +210,17 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = 
COPY $vgpr2_vgpr3 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: 
$vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} @@ -226,6 +230,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -233,18 +238,17 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, 
dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -281,18 +285,17 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-NEXT: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -300,18 +303,17 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -319,18 +321,17 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], 
%subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -338,17 +339,15 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 4294967292, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -378,6 +377,7 @@ body: | ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX7-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -386,6 +386,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -394,6 +395,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -427,6 +429,7 @@ body: | ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX7-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} @@ -435,6 +438,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX9-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit 
$exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -443,6 +447,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX10-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir index bc131f53910d9..e1ef96bec0fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -144,17 +144,15 @@ body: | ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = 
COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 @@ -164,17 +162,15 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX8-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 @@ -344,17 +340,15 @@ body: | ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; 
GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 @@ -364,17 +358,15 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX8-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; ; GFX9-LABEL: 
name: amdgpu_atomic_cmpxchg_s64_global_gep4 @@ -423,22 +415,20 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE4]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN]].sub0 ; GFX6-NEXT: $vgpr0 = COPY [[COPY7]] ; @@ -449,22 +439,20 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE4]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN]].sub0 ; GFX7-NEXT: $vgpr0 = COPY [[COPY7]] ; @@ -475,17 +463,15 @@ body: | ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; 
GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, 
implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 @@ -495,17 +481,15 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 @@ -834,19 +818,17 @@ body: | ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX7-FLAT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; GFX7-FLAT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4095 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc ; GFX7-FLAT-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def dead $scc, implicit $scc - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 
implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 @@ -855,19 +837,17 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4095 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def dead $scc, implicit $scc - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; 
GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir index 78071022fc05c..868f08d805caa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir @@ -21,6 +21,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -28,6 +29,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store 
seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -35,6 +37,7 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -64,18 +67,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -102,18 +108,17 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -121,23 +126,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = 
COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: 
(load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -168,39 +173,38 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store 
seq_cst (s32)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load 
store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -229,18 +233,17 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst 
(s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -248,23 +251,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -295,39 +298,38 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -356,18 +358,17 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -375,23 +376,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -422,39 +423,38 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; 
GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX11: liveins: 
$vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -483,68 +483,63 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; 
; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY 
[[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: 
(load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4097 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], 
[[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -569,65 +564,60 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 
(s32)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; 
GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; 
GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -652,6 +642,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], 
[[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -659,6 +650,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -666,6 +658,7 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s64 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -695,18 +688,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; 
GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s64_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -733,18 +729,17 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: 
[[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -752,23 +747,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], 
[[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -799,39 +794,38 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir index d3eaee283d644..43d9b911b9f33 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir @@ -27,6 +27,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32 ; GFX7: liveins: 
$vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -34,6 +35,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -41,6 +43,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -48,6 +51,7 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -82,24 +86,28 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; 
; GFX9-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -133,23 +141,23 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -157,6 +165,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -164,6 +173,7 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], 
[[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -200,34 +210,36 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -263,23 +275,23 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -287,23 +299,23 @@ 
body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -340,44 +352,44 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit 
$exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -413,23 +425,23 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -437,23 +449,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) 
; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit 
$exec :: (load store seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -490,44 +502,44 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -564,73 +576,69 @@ body: | ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX10-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = 
V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -662,70 +670,66 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX7: liveins: 
$vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 4097, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -755,6 +759,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX6-NEXT: 
$vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} @@ -762,6 +767,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -769,6 +775,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -776,6 +783,7 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s64 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -810,24 +818,28 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_X2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -861,23 +873,23 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] + ; ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -885,23 +897,23 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; 
GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN 
[[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; ; GFX11-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -938,44 +950,44 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_ATOMIC_ADD_X2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed 
[[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) + ; ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; ; GFX11-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index dadc6f32dfb8c..ca3fd71f6c981 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -136,58 +136,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_v_s64 - ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = 
V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: 
[[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: name: constant_v_s64 - ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 
4294967295, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: 
[[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:vgpr(s64) = G_CONSTANT i64 0 %1:vgpr(s64) = G_CONSTANT i64 1 %2:vgpr(s64) = G_CONSTANT i64 -1 @@ -208,42 +184,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_s_s64 - ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = 
S_MOV_B64_IMM_PSEUDO -54 + ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: name: constant_s_s64 - ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 
= S_MOV_B64 1 - ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54 + ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], 
implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:sgpr(s64) = G_CONSTANT i64 0 %1:sgpr(s64) = G_CONSTANT i64 1 %2:sgpr(s64) = G_CONSTANT i64 -1 @@ -351,42 +319,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_s_p1 - ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54 + ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE64-NEXT: 
[[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: name: constant_s_p1 - ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54 + ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; 
WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:sgpr(p1) = G_CONSTANT i64 0 %1:sgpr(p1) = G_CONSTANT i64 1 %2:sgpr(p1) = G_CONSTANT i64 -1 @@ -407,58 +367,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_v_p1 - ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = 
V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], 
%subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: name: constant_v_p1 - ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, 
implicit $exec + ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:vgpr(p1) = G_CONSTANT i64 0 %1:vgpr(p1) = G_CONSTANT i64 1 %2:vgpr(p1) = G_CONSTANT i64 -1 @@ -479,42 +415,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_s_p999 - ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 
1 + ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54 + ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: 
name: constant_s_p999 - ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54 + ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27 + ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295 + ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 - ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; 
WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:sgpr(p999) = G_CONSTANT i64 0 %1:sgpr(p999) = G_CONSTANT i64 1 %2:sgpr(p999) = G_CONSTANT i64 -1 @@ -535,58 +463,34 @@ tracksRegLiveness: true body: | bb.0: ; WAVE64-LABEL: name: constant_v_p999 - ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; 
WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, 
[[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] ; ; WAVE32-LABEL: name: constant_v_p999 - ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], 
%subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], 
implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]] %0:vgpr(p999) = G_CONSTANT i64 0 %1:vgpr(p999) = G_CONSTANT i64 1 %2:vgpr(p999) = G_CONSTANT i64 -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir index 23b10218cbbe8..2465c374cc11d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir @@ -61,21 +61,13 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: fconstant_v_s64 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075838976, implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; GCN-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1073741824, implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; GCN-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1076101120, implicit $exec - ; 
GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[REG_SEQUENCE]] - ; GCN-NEXT: $vgpr2_vgpr3 = COPY [[REG_SEQUENCE1]] - ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + ; GCN-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4620693217682128896, implicit $exec + ; GCN-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4611686018427387904, implicit $exec + ; GCN-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4621819117588971520, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MOV_B]] + ; GCN-NEXT: $vgpr2_vgpr3 = COPY [[V_MOV_B1]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MOV_B2]], implicit [[V_MOV_B3]] %0:vgpr(s64) = G_FCONSTANT double 1.0 %1:vgpr(s64) = G_FCONSTANT double 8.0 %2:vgpr(s64) = G_FCONSTANT double -2.0 @@ -95,17 +87,13 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: fconstant_s_s64 - ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408 - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1075838976 - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904 - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1071382528 - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]] - ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[REG_SEQUENCE]] - ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_1]], implicit [[REG_SEQUENCE1]] + ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = 
S_MOV_B64_IMM_PSEUDO 4607182418800017408 + ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896 + ; GCN-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4611686018427387904 + ; GCN-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288 + ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B]] + ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B1]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]] %0:sgpr(s64) = G_FCONSTANT double 1.0 %1:sgpr(s64) = G_FCONSTANT double 8.0 %2:sgpr(s64) = G_FCONSTANT double -2.0 @@ -136,7 +124,6 @@ body: | %2:vgpr(s32) = G_ANYEXT %0 %3:vgpr(s32) = G_ANYEXT %1 - ; Test without already assigned register class %4:vgpr(s16) = G_FCONSTANT half 1.0 %5:vgpr(s16) = G_FCONSTANT half 8.0 $vgpr0 = COPY %2 @@ -168,7 +155,6 @@ body: | %2:vgpr(s32) = G_ANYEXT %0 %3:vgpr(s32) = G_ANYEXT %1 - ; Test without already assigned register class %4:sgpr(s16) = G_FCONSTANT half 1.0 %5:sgpr(s16) = G_FCONSTANT half 8.0 $sgpr0 = COPY %2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir index 15ece434487ed..6ccd2a9c3e678 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir @@ -17,24 +17,21 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, implicit $exec, implicit $flat_scr :: (store 
(s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = COPY $vgpr1 %3:vgpr(p1) = COPY $vgpr3_vgpr4 - ; fmul vs %4:vgpr(s32) = G_FMUL %1, %0 - ; fmul sv %5:vgpr(s32) = G_FMUL %0, %1 - ; fmul vv %6:vgpr(s32) = G_FMUL %1, %2 G_STORE %4, %3 :: (store (s32), addrspace 1) @@ -57,22 +54,19 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GCN-NEXT: %4:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %5:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %4, implicit %5, implicit %6 + ; GCN-NEXT: 
[[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F64_e64_2:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MUL_F64_e64_]], implicit [[V_MUL_F64_e64_1]], implicit [[V_MUL_F64_e64_2]] %0:sgpr(s64) = COPY $sgpr0_sgpr1 %1:vgpr(s64) = COPY $vgpr0_vgpr1 %2:vgpr(s64) = COPY $vgpr2_vgpr3 %3:vgpr(p1) = COPY $vgpr4_vgpr5 - ; fmul vs %4:vgpr(s64) = G_FMUL %1, %0 - ; fmul sv %5:vgpr(s64) = G_FMUL %0, %1 - ; fmul vv %6:vgpr(s64) = G_FMUL %1, %2 S_ENDPGM 0, implicit %4, implicit %5, implicit %6 @@ -92,10 +86,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %7:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %7, implicit %8, implicit %9 + ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]], implicit [[V_MUL_F16_e64_1]], implicit [[V_MUL_F16_e64_2]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = COPY 
$vgpr1 @@ -105,13 +99,10 @@ body: | %5:vgpr(s16) = G_TRUNC %1 %6:vgpr(s16) = G_TRUNC %2 - ; fmul vs %8:vgpr(s16) = G_FMUL %4, %4 - ; fmul sv %9:vgpr(s16) = G_FMUL %4, %4 - ; fmul vv %10:vgpr(s16) = G_FMUL %4, %5 S_ENDPGM 0, implicit %8, implicit %9, implicit %10 @@ -131,26 +122,26 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %7:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %10:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %11:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, implicit $exec, 
implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) - ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_7:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_8:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit 
$mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_9:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_5]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_6]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_7]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(p1) = COPY $vgpr2_vgpr3 @@ -159,37 +150,27 @@ body: | %4:vgpr(s32) = G_FNEG %0 %5:vgpr(s32) = G_FNEG %3 - ; fabs lhs %6:vgpr(s32) = G_FMUL %3, %0 - ; fabs rhs %7:vgpr(s32) = G_FMUL %0, %3 - ; fabs lhs, rhs %8:vgpr(s32) = G_FMUL %3, %3 - ; fneg lhs %9:vgpr(s32) = G_FMUL %4, %0 - ; fneg rhs %10:vgpr(s32) = G_FMUL %0, %4 - ; fneg lhs, rhs %11:vgpr(s32) = G_FMUL %4, %4 - ; fneg fabs lhs %12:vgpr(s32) = G_FMUL %5, %0 - ; fneg fabs rhs 
%13:vgpr(s32) = G_FMUL %0, %5 - ; fneg fabs lhs, rhs %14:vgpr(s32) = G_FMUL %5, %5 - ; fneg fabs lhs, fneg rhs %15:vgpr(s32) = G_FMUL %5, %4 G_STORE %6, %2 :: (store (s32), addrspace 1) @@ -369,10 +350,8 @@ body: | ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 1, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4598175219545276416, implicit $exec + ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 1, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]] ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 %0:vgpr(s64) = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index cda4414a0d90e..47e5a2f35a567 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -20,10 +20,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, 
[[S_MOV_B32_1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec @@ -62,7 +60,6 @@ body: | bb.1: liveins: $sgpr0_sgpr1 - ; S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) ; CHECK-LABEL: name: fract_f64_neg_abs ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} @@ -71,10 +68,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir index c9b1b782658c7..de21788ff168f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -21,18 +21,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -62,18 +65,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX9-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX10-LABEL: name: 
load_atomic_flat_v2s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX11-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -103,18 +109,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3)) ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX9-LABEL: name: load_atomic_flat_p3_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3)) ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX10-LABEL: name: load_atomic_flat_p3_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3)) ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX11-LABEL: name: load_atomic_flat_p3_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -144,18 +153,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_atomic_flat_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: 
load_atomic_flat_s64_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_atomic_flat_s64_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -185,18 +197,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX9-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX10-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX11-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -226,18 +241,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX9-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x 
s16>) + ; ; GFX10-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX11-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -267,18 +285,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX9-LABEL: name: load_atomic_flat_p1_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX10-LABEL: name: load_atomic_flat_p1_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX11-LABEL: name: load_atomic_flat_p1_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -308,18 +329,21 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX9-LABEL: name: load_atomic_flat_p0_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX10-LABEL: name: load_atomic_flat_p0_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX11-LABEL: name: load_atomic_flat_p0_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -347,65 +371,60 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: 
[[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, 
implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 
(s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY 
[[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -430,40 +449,39 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: 
load_atomic_flat_s32_seq_cst_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir index 2c2e792bce66d..b678966de8537 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -28,6 +28,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -39,18 +40,21 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) ; 
GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -80,24 +84,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX7-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX9-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; ; GFX10-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -127,24 +135,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: 
[[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX7-LABEL: name: load_atomic_global_p3_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_p3_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX9-LABEL: name: load_atomic_global_p3_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX10-LABEL: name: load_atomic_global_p3_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -179,6 +191,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -190,18 +203,21 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec 
:: (load seq_cst (s64), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -231,24 +247,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX7-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX9-LABEL: name: load_atomic_global_v2s32_seq_cst ; 
GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; ; GFX10-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -278,24 +298,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX7-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX9-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; ; GFX10-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -325,24 +349,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = 
COPY [[LOAD]](p1) + ; ; GFX7-LABEL: name: load_atomic_global_p1_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_p1_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX9-LABEL: name: load_atomic_global_p1_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX10-LABEL: name: load_atomic_global_p1_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -372,24 +400,28 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX7-LABEL: name: load_atomic_global_p0_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_p0_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX9-LABEL: name: 
load_atomic_global_p0_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX10-LABEL: name: load_atomic_global_p0_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -417,66 +449,64 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 
= COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -513,6 +543,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -524,43 +555,42 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - 
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 @@ -585,66 +615,64 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) + ; GFX6-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = 
S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, implicit $exec :: (load seq_cst (s64), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir index 8f17cc3ab47ec..a53fd81f351a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir @@ -1189,11 +1189,11 @@ body: | ; GFX6: liveins: $sgpr0_sgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 @@ -1204,11 +1204,11 @@ body: | ; GFX7: liveins: $sgpr0_sgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = 
REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 @@ -1219,11 +1219,11 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 @@ -1259,51 +1259,45 @@ body: | ; GFX6: liveins: $sgpr0_sgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 
[[COPY2]], implicit-def $scc ; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX6-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4) + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX6-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX6-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] ; ; GFX7-LABEL: name: load_constant_s32_from_4_gep_negative_524288 ; GFX7: liveins: $sgpr0_sgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX7-NEXT: 
[[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX7-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX7-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] ; ; GFX8-LABEL: name: load_constant_s32_from_4_gep_negative_524288 ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX8-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX8-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = 
S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX8-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] ; ; GFX10-LABEL: name: load_constant_s32_from_4_gep_negative_524288 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir index 78812ca1991f9..d7c3254398862 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -23,24 +23,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_flat_s32_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_flat_s32_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_flat_s32_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_flat_s32_from_4 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -70,24 +74,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: 
[[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX8-LABEL: name: load_flat_s32_from_2 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX9-LABEL: name: load_flat_s32_from_2 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX10-LABEL: name: load_flat_s32_from_2 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX11-LABEL: name: load_flat_s32_from_2 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -117,24 +125,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -164,24 +176,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_flat_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_flat_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_flat_v2s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY 
[[FLAT_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_flat_v2s32 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -211,24 +227,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] + ; ; GFX8-LABEL: name: load_flat_v3s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] + ; ; GFX9-LABEL: name: load_flat_v3s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] + ; ; GFX10-LABEL: name: load_flat_v3s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] + ; ; GFX11-LABEL: name: load_flat_v3s32 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -258,24 +278,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_flat_v4s32 ; GFX8: liveins: $vgpr0_vgpr1 ; 
GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_flat_v4s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_flat_v4s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: load_flat_v4s32 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -305,24 +329,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 
implicit $exec, implicit $flat_scr :: (load (s64)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_flat_s64 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -352,24 +380,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_flat_v2s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_flat_v2s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_flat_v2s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: 
load_flat_v2s64 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -399,24 +431,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX8-LABEL: name: load_flat_v2p1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX9-LABEL: name: load_flat_v2p1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX10-LABEL: name: load_flat_v2p1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX11-LABEL: name: load_flat_v2p1 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -446,24 +482,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) + ; ; GFX8-LABEL: name: load_flat_s96 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) + ; ; GFX9-LABEL: name: load_flat_s96 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) + ; ; GFX10-LABEL: name: load_flat_s96 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) + ; ; GFX11-LABEL: name: load_flat_s96 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -493,24 +533,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX8-LABEL: name: load_flat_s128 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX9-LABEL: name: load_flat_s128 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX10-LABEL: name: load_flat_s128 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX11-LABEL: name: load_flat_s128 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -540,24 +584,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit 
$exec, implicit $flat_scr :: (load (p3)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_flat_p3_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_flat_p3_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_flat_p3_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_flat_p3_from_4 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -587,24 +635,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_flat_p1_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_flat_p1_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: 
[[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_flat_p1_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_flat_p1_from_8 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -634,24 +686,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX8-LABEL: name: load_flat_p999_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX9-LABEL: name: load_flat_p999_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX10-LABEL: name: load_flat_p999_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX11-LABEL: name: load_flat_p999_from_8 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -681,24 +737,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) ; 
GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX8-LABEL: name: load_flat_v2p3 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX9-LABEL: name: load_flat_v2p3 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX10-LABEL: name: load_flat_v2p3 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX11-LABEL: name: load_flat_v2p3 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -728,24 +788,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_flat_v2s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_flat_v2s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX10-LABEL: name: 
load_flat_v2s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_flat_v2s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -775,24 +839,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_flat_v4s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_flat_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_flat_v4s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_flat_v4s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -822,24 +890,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD 
[[COPY]](p1) :: (load (<6 x s16>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) + ; ; GFX8-LABEL: name: load_flat_v6s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) + ; ; GFX9-LABEL: name: load_flat_v6s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) + ; ; GFX10-LABEL: name: load_flat_v6s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) + ; ; GFX11-LABEL: name: load_flat_v6s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -869,24 +941,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_flat_v8s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_flat_v8s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: 
[[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_flat_v8s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: load_flat_v8s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -918,56 +994,54 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -997,56 +1071,54 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, 
implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1076,81 +1148,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; 
GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit 
$exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2047 @@ -1175,81 +1241,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], 
%subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 
implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], 
%subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -1274,56 +1334,54 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit 
$flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: 
load_flat_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], 
%subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1353,81 +1411,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY 
[[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX11: 
liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 @@ -1452,81 +1504,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; 
GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 4294963201, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
-1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; 
GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4095 @@ -1551,81 +1597,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = 
V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; 
GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 
= COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; 
GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4096 @@ -1650,81 +1690,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed 
[[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 @@ -1749,81 +1783,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: 
(load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8192 @@ -1848,81 +1876,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY 
[[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX10: liveins: $vgpr0_vgpr1 
; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 @@ -1947,81 +1969,75 @@ body: | ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX8-NEXT: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, 
[[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 
= V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir index 7ffa65f922456..0103bfc9d39c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -243,17 +243,15 @@ body: | ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit 
$exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD 
[[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; ; GFX11-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095 @@ -310,17 +308,15 @@ body: | ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], 
%subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; ; GFX11-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 @@ -440,35 +436,31 @@ body: | ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: 
[[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199 - ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 
implicit $exec :: (load (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; @@ -476,17 +468,15 @@ body: | ; GFX11: liveins: $sgpr0_sgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199 - ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 @@ -562,17 +552,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247 
- ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -2049 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir index f26e23293dae7..27806edc91808 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir @@ -1212,34 +1212,30 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 2047, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 2047, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047 @@ -1309,34 +1305,30 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; 
GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX8-NEXT: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048 @@ -1350,17 +1342,15 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048 @@ -1392,78 +1382,70 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 
61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = 
COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047 @@ -1509,78 +1491,70 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; 
GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, 
implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048 @@ -1650,34 +1624,30 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), 
addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095 @@ -1691,17 +1661,15 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095 @@ -1759,85 +1727,75 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; 
GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit 
$exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; 
GFX11-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 @@ -1862,78 +1820,70 @@ 
body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; 
GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095 @@ -1947,17 +1897,15 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
[[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095 @@ -1989,78 +1937,70 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: 
load_global_s32_from_1_gep_m4096 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: 
[[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], 
killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; 
GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096 @@ -2074,17 +2014,15 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096 @@ -2142,85 +2080,75 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: 
[[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit 
$exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 @@ -2271,85 +2199,75 @@ body: | ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], 
%subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; 
GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 
%1:vgpr(s64) = G_CONSTANT i64 8192 @@ -2374,129 +2292,115 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 
- ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX7-FLAT-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX8-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = 
COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 @@ -2521,129 +2425,115 @@ body: | ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY 
[[REG_SEQUENCE]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7-NEXT: 
[[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], 
%subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) 
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec - ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir index edace19c47b16..ad53a2fd81120 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -193,7 +193,7 @@ body: | # Test a load of an offset from a constant base address # GCN-LABEL: name: constant_address_positive{{$}} -# GCN: %0:sreg_64 = S_MOV_B64 44 +# GCN: %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 44 # VI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 64, 0 :: (dereferenceable invariant load (s32), addrspace 4) # SICI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0 :: (dereferenceable invariant load (s32), addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir index 1a20e55958742..3f020aaa0365c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir @@ -314,8 +314,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]] %0:sgpr(p0) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 0 @@ -441,7 
+441,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -2 + ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0 @@ -468,7 +468,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -4 + ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0 @@ -495,7 +495,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -8 + ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0 @@ -522,7 +522,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -16 + ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -16 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0 @@ -549,9 +549,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3758096384 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; CHECK-NEXT: %const:sreg_64 = 
REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -536870912 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0 @@ -776,9 +774,7 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967294, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -2, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0 @@ -805,9 +801,7 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0 @@ -834,9 +828,7 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, 
implicit $exec - ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0 @@ -863,9 +855,7 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967280, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -16, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0 @@ -892,9 +882,7 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3758096384, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -536870912, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir index c7520de936aa8..fc6925ee5709c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -22,24 +22,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX8-LABEL: name: store_flat_s32_to_4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX9-LABEL: name: store_flat_s32_to_4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX10-LABEL: name: store_flat_s32_to_4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX11-LABEL: name: store_flat_s32_to_4 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -68,24 +72,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16)) + ; ; GFX8-LABEL: name: store_flat_s32_to_2 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, 
implicit $flat_scr :: (store (s16)) + ; ; GFX9-LABEL: name: store_flat_s32_to_2 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16)) + ; ; GFX10-LABEL: name: store_flat_s32_to_2 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16)) + ; ; GFX11-LABEL: name: store_flat_s32_to_2 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -114,24 +122,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8)) + ; ; GFX8-LABEL: name: store_flat_s32_to_1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8)) + ; ; GFX9-LABEL: name: store_flat_s32_to_1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8)) + ; ; GFX10-LABEL: name: store_flat_s32_to_1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store 
(s8)) + ; ; GFX11-LABEL: name: store_flat_s32_to_1 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -161,24 +173,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; ; GFX8-LABEL: name: store_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; ; GFX9-LABEL: name: store_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; ; GFX10-LABEL: name: store_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; ; GFX11-LABEL: name: store_flat_s64 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -207,24 +223,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16) + ; ; GFX8-LABEL: name: store_flat_s96 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: 
(store (s96), align 16) + ; ; GFX9-LABEL: name: store_flat_s96 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16) + ; ; GFX10-LABEL: name: store_flat_s96 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16) + ; ; GFX11-LABEL: name: store_flat_s96 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: {{ $}} @@ -253,24 +273,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) + ; ; GFX8-LABEL: name: store_flat_s128 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) + ; ; GFX9-LABEL: name: store_flat_s128 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) + ; ; GFX10-LABEL: name: store_flat_s128 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) + ; ; GFX11-LABEL: name: store_flat_s128 
; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -300,24 +324,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) + ; ; GFX8-LABEL: name: store_flat_v2s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) + ; ; GFX9-LABEL: name: store_flat_v2s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) + ; ; GFX10-LABEL: name: store_flat_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) + ; ; GFX11-LABEL: name: store_flat_v2s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -346,24 +374,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) + ; ; GFX8-LABEL: name: store_flat_v3s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: 
FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) + ; ; GFX9-LABEL: name: store_flat_v3s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) + ; ; GFX10-LABEL: name: store_flat_v3s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) + ; ; GFX11-LABEL: name: store_flat_v3s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: {{ $}} @@ -392,24 +424,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) + ; ; GFX8-LABEL: name: store_flat_v4s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) + ; ; GFX9-LABEL: name: store_flat_v4s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) + ; ; GFX10-LABEL: name: 
store_flat_v4s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) + ; ; GFX11-LABEL: name: store_flat_v4s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -439,24 +475,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) + ; ; GFX8-LABEL: name: store_flat_v2s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) + ; ; GFX9-LABEL: name: store_flat_v2s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) + ; ; GFX10-LABEL: name: store_flat_v2s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) + ; ; GFX11-LABEL: name: store_flat_v2s16 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -486,24 +526,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], 
[[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) + ; ; GFX8-LABEL: name: store_flat_v4s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) + ; ; GFX9-LABEL: name: store_flat_v4s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) + ; ; GFX10-LABEL: name: store_flat_v4s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) + ; ; GFX11-LABEL: name: store_flat_v4s16 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -533,24 +577,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) + ; ; GFX8-LABEL: name: store_flat_v6s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 ; GFX8-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) + ; ; GFX9-LABEL: name: store_flat_v6s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x 
s16>) = COPY $vgpr2_vgpr3_vgpr4 ; GFX9-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) + ; ; GFX10-LABEL: name: store_flat_v6s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) + ; ; GFX11-LABEL: name: store_flat_v6s16 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: {{ $}} @@ -579,24 +627,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) + ; ; GFX8-LABEL: name: store_flat_v8s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) + ; ; GFX9-LABEL: name: store_flat_v8s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) + ; ; GFX10-LABEL: name: store_flat_v8s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) + ; ; GFX11-LABEL: name: store_flat_v8s16 ; GFX11: 
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -626,24 +678,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) + ; ; GFX8-LABEL: name: store_flat_v2s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) + ; ; GFX9-LABEL: name: store_flat_v2s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) + ; ; GFX10-LABEL: name: store_flat_v2s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) + ; ; GFX11-LABEL: name: store_flat_v2s64 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -673,24 +729,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) + ; ; GFX8-LABEL: name: store_flat_p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: 
[[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) + ; ; GFX9-LABEL: name: store_flat_p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) + ; ; GFX10-LABEL: name: store_flat_p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) + ; ; GFX11-LABEL: name: store_flat_p1 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -720,24 +780,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) + ; ; GFX8-LABEL: name: store_flat_v2p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) + ; ; GFX9-LABEL: name: store_flat_v2p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) + ; ; GFX10-LABEL: name: store_flat_v2p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY 
$vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) + ; ; GFX11-LABEL: name: store_flat_v2p1 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: {{ $}} @@ -767,24 +831,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) + ; ; GFX8-LABEL: name: store_flat_p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) + ; ; GFX9-LABEL: name: store_flat_p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) + ; ; GFX10-LABEL: name: store_flat_p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) + ; ; GFX11-LABEL: name: store_flat_p3 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -814,24 +882,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) + ; ; GFX8-LABEL: name: store_flat_v2p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: 
[[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) + ; ; GFX9-LABEL: name: store_flat_v2p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) + ; ; GFX10-LABEL: name: store_flat_v2p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) + ; ; GFX11-LABEL: name: store_flat_v2p3 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -860,24 +932,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) + ; ; GFX8-LABEL: name: store_atomic_flat_s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) + ; ; GFX9-LABEL: name: store_atomic_flat_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) + ; ; GFX10-LABEL: name: store_atomic_flat_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr2 ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) + ; ; GFX11-LABEL: name: store_atomic_flat_s32 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -907,24 +983,28 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) + ; ; GFX8-LABEL: name: store_atomic_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) + ; ; GFX9-LABEL: name: store_atomic_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) + ; ; GFX10-LABEL: name: store_atomic_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) + ; ; GFX11-LABEL: name: store_atomic_flat_s64 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX11-NEXT: {{ $}} @@ -953,55 +1033,53 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec 
- ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX8-LABEL: name: store_flat_s32_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: 
[[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX9-LABEL: name: store_flat_s32_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX10-LABEL: name: store_flat_s32_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 
= V_MOV_B32_e32 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; ; GFX11-LABEL: name: store_flat_s32_gep_2047 ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir index c56ba70b667d9..95e01aa413a37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir @@ -1168,34 +1168,30 @@ body: | ; GFX7-FLAT-NEXT: 
{{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) ; ; GFX8-LABEL: name: store_global_s32_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec - ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) ; ; GFX9-LABEL: name: store_global_s32_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 
13f7885207105..ce27598f69b3f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -1638,21 +1638,21 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0x40200000 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_val_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: s_mov_b32 s3, 0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -1662,22 +1662,18 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s3, 0x40200000 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_val_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40200000 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 02d1a92c69373..4ab963ed85cca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -25,11 +25,8 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX10-LABEL: global_atomic_csub_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b64 s[4:5], 0x1000 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -37,12 +34,8 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-LABEL: global_atomic_csub_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], 0x1000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -73,11 +66,8 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX10-LABEL: global_atomic_csub_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b64 s[4:5], 0x1000 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -85,12 +75,8 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-LABEL: global_atomic_csub_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], 0x1000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index aecfbe7aa2260..3ccedb0733d51 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -317,31 +317,31 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GCN-NEXT: s_mov_b64 s[10:11], 1.0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 1.0 +; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1] -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: v_accvgpr_write_b32 a0, s4 -; GCN-NEXT: v_accvgpr_write_b32 a1, s5 -; GCN-NEXT: v_accvgpr_write_b32 a2, s6 -; GCN-NEXT: v_accvgpr_write_b32 a3, s7 -; GCN-NEXT: v_accvgpr_write_b32 a4, s8 -; GCN-NEXT: v_accvgpr_write_b32 a5, s9 -; GCN-NEXT: v_accvgpr_write_b32 a6, s10 -; GCN-NEXT: v_accvgpr_write_b32 a7, s11 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_accvgpr_write_b32 a4, s4 +; GCN-NEXT: v_accvgpr_write_b32 a5, s5 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 +; GCN-NEXT: v_accvgpr_write_b32 a7, s7 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: 
s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll index 0ec4f64b38a1b..a4d5fe4ffa5a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll @@ -12,11 +12,12 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: s_xor_b64 s[4:5], exec, s[0:1] ; LOOP-NEXT: s_cbranch_execz .LBB0_3 ; LOOP-NEXT: ; %bb.1: ; %copy_forward -; LOOP-NEXT: s_mov_b64 s[0:1], 0 +; LOOP-NEXT: s_mov_b64 s[6:7], 0 ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 -; LOOP-NEXT: v_mov_b32_e32 v5, s1 -; LOOP-NEXT: v_mov_b32_e32 v4, s0 +; LOOP-NEXT: s_mov_b64 s[0:1], 0 +; LOOP-NEXT: v_mov_b32_e32 v4, s6 +; LOOP-NEXT: v_mov_b32_e32 v5, s7 ; LOOP-NEXT: .LBB0_2: ; %copy_forward_loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll index 7cd3babc70909..3edd2e0914a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll @@ -7,11 +7,12 @@ declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1) define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) { ; LOOP-LABEL: memset_p1i8: ; LOOP: ; %bb.0: ; %loadstoreloop.preheader -; LOOP-NEXT: s_mov_b64 s[0:1], 0 +; LOOP-NEXT: s_mov_b64 s[4:5], 0 ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 -; 
LOOP-NEXT: v_mov_b32_e32 v4, s1 -; LOOP-NEXT: v_mov_b32_e32 v3, s0 +; LOOP-NEXT: s_mov_b64 s[0:1], 0 +; LOOP-NEXT: v_mov_b32_e32 v3, s4 +; LOOP-NEXT: v_mov_b32_e32 v4, s5 ; LOOP-NEXT: .LBB0_1: ; %loadstoreloop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4248f7b6a1583..9dacdbc46be19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2259,12 +2259,13 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2717,161 +2718,163 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 -; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 +; CGP-NEXT: v_lshl_b64 v[11:12], s[4:5], v4 ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v9, v12 ; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 
+; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v10, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v15, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v15, v10 -; CGP-NEXT: v_add_i32_e32 v3, 
vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v13 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v12, v10 -; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v15, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v12 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v0 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v12, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 +; CGP-NEXT: 
v_xor_b32_e32 v1, v10, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 +; CGP-NEXT: v_trunc_f32_e32 v12, v11 +; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], 
s[4:5], v15, v13, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v3, v8 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v10, v[8:9] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v14, v8 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; CGP-NEXT: 
v_sub_i32_e32 v3, vcc, v3, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_cndmask_b32_e64 v4, v9, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 +; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, 
vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 +; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: 
; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9] -; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[8:9] +; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 +; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2879,19 +2882,19 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v0, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v11 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index d0c55c69f5087..d1599ac489a5f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -2219,12 +2219,13 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-LABEL: v_srem_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2671,159 +2672,164 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 -; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 +; CGP-NEXT: v_lshl_b64 v[11:12], s[4:5], v4 ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v9, v12 ; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: 
v_ashrrev_i32_e32 v1, 31, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v11, v1 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v12, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 -; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v12, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 ; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 -; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc 
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, 
vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v12, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 +; 
CGP-NEXT: v_mul_lo_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_mov_b32_e32 v4, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v0 +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 -; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 +; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v11 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v11 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc -; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] -; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 +; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2831,13 +2837,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: 
v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v11 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 77737b356ff6e..b3b57e14cb3fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1070,11 +1070,12 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v4, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1509,20 +1510,22 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 +; CGP-NEXT: 
s_mov_b64 s[4:5], 0x1000 +; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v11, 0 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v0 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1531,105 +1534,105 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v1, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v0, v11 -; CGP-NEXT: v_mul_lo_u32 v16, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v16, v1, v0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v14 +; CGP-NEXT: 
v_mul_hi_u32 v17, v0, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_mul_lo_u32 v16, v0, v13 +; CGP-NEXT: v_mul_lo_u32 v18, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v19, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v13 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v4, v1 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 
v13, vcc, v13, v17 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v12, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v0, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v4, v1 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 +; CGP-NEXT: 
v_mul_lo_u32 v13, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v0 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v4, v2, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v0 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v9, v4, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: 
v_add_i32_e32 v13, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v9, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v3 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v3 -; CGP-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v3 +; CGP-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v2 ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 @@ -1639,8 +1642,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v14, v15, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1648,9 +1651,9 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9] -; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[8:9] +; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 +; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; 
CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 @@ -1673,7 +1676,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 097f6642cbc66..ecf7cc921886c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1576,11 +1576,12 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v4, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2010,20 +2011,22 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 +; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v11, 0 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[0:1] ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v0 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2032,92 +2035,92 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v1, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v0, v11 -; CGP-NEXT: v_mul_lo_u32 v16, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v16, v1, v0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v14 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_mul_lo_u32 v16, v0, v13 +; CGP-NEXT: v_mul_lo_u32 v18, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v19, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v13 +; CGP-NEXT: 
v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v1, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v4, v1 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, 
v14, v13 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v12, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v0, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v4, v1 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; CGP-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v3, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v1, v2, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v10 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v12 ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v0, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 @@ -2128,19 +2131,19 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CGP-NEXT: v_sub_i32_e32 v9, vcc, v1, v2 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v0, vcc +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: 
v_subb_u32_e32 v0, vcc, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v3 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v14, v13, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -2148,8 +2151,8 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] -; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 +; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 8c4483fc118db..bf5843ea8047d 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -691,12 +691,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX90A-NEXT: s_sub_i32 s4, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: 
v_cvt_f32_f16_e32 v0, s9 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s9 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 ; GFX90A-NEXT: s_mul_i32 s4, s4, s10 ; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 ; GFX90A-NEXT: s_add_i32 s10, s10, s4 @@ -713,7 +713,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 ; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 ; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s9 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s9 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 ; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 @@ -737,7 +737,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] @@ -795,7 +795,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[0:1], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[2:3], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index d342c4ffa37b0..2b444e5e0e1f3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7910,8 +7910,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_lshl_b64 s[4:5], 0x1000, s8 ; GFX6-NEXT: s_add_u32 s4, s4, -1 ; GFX6-NEXT: s_addc_u32 s5, s5, -1 ; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] @@ -7923,10 +7922,9 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -8000,45 +7998,43 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[6:7], s[12:13], s6 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 -; GFX6-NEXT: s_add_u32 s4, s4, -1 -; GFX6-NEXT: s_addc_u32 s5, s5, -1 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_add_u32 s4, s6, -1 
-; GFX6-NEXT: s_addc_u32 s5, s7, -1 -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s8, s8, -1 +; GFX6-NEXT: s_addc_u32 s9, s9, -1 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_add_u32 s8, s10, -1 +; GFX6-NEXT: s_addc_u32 s9, s11, -1 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 +; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 +; GFX9-NEXT: s_add_u32 s8, s8, -1 +; GFX9-NEXT: s_addc_u32 s9, s9, -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_add_u32 s2, s2, -1 ; GFX9-NEXT: s_addc_u32 s3, s3, -1 -; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX9-NEXT: s_add_u32 s4, s10, -1 -; GFX9-NEXT: s_addc_u32 s5, s11, -1 -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 
v3, s3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -8297,12 +8293,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -8334,17 +8329,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] +; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -8352,6 +8346,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: 
s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 @@ -8433,10 +8428,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_endpgm ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_add_u32 s4, s4, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8462,12 +8456,12 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 ; GFX9-NEXT: s_mul_i32 s13, s1, s11 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_mul_i32 s15, s0, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 ; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_i32 s11, s11, s12 +; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 ; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 ; GFX9-NEXT: s_mul_i32 s15, s10, s15 @@ -8884,10 +8878,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], 0x1000, s10 ; GFX6-NEXT: s_ashr_i32 s14, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s14 ; GFX6-NEXT: s_mov_b32 s15, s14 @@ -8920,15 +8913,15 @@ define 
amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9141,11 +9134,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -9170,8 +9162,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s18, s0, s15 ; GFX9-NEXT: s_mul_i32 s17, s1, s15 ; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_add_i32 s16, s16, s17 ; GFX9-NEXT: s_mul_i32 s19, s0, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s17 ; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 ; GFX9-NEXT: s_mul_i32 s18, s15, s16 ; GFX9-NEXT: s_mul_hi_u32 s15, 
s15, s19 @@ -9691,12 +9683,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 @@ -9728,17 +9719,16 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] +; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9746,6 +9736,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 
v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 @@ -9825,10 +9816,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_endpgm ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 @@ -9854,12 +9844,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 ; GFX9-NEXT: s_mul_i32 s11, s1, s3 ; GFX9-NEXT: s_add_i32 s10, s12, s10 -; GFX9-NEXT: s_add_i32 s10, s10, s11 ; GFX9-NEXT: s_mul_i32 s13, s0, s3 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s13 ; GFX9-NEXT: s_mul_hi_u32 s11, s3, s10 -; GFX9-NEXT: s_mul_i32 s12, s3, s10 -; GFX9-NEXT: s_mul_hi_u32 s3, s3, s13 -; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_mul_i32 s3, s3, s10 +; GFX9-NEXT: s_add_u32 s3, s12, s3 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 ; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 ; GFX9-NEXT: s_mul_i32 s13, s2, s13 @@ -10063,11 +10053,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -10101,15 +10090,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: 
v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -10316,11 +10305,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -10345,8 +10333,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 ; GFX9-NEXT: s_mul_i32 s15, s1, s3 ; GFX9-NEXT: s_add_i32 s14, s16, s14 -; GFX9-NEXT: s_add_i32 s14, s14, s15 ; GFX9-NEXT: s_mul_i32 s17, s0, s3 +; GFX9-NEXT: s_add_i32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s15, s3, s14 ; GFX9-NEXT: s_mul_i32 s16, s3, s14 ; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index abd9a4159f8cc..60c3134445218 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -4416,9 +4416,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] @@ -4452,8 +4452,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_cbranch_execz .LBB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] @@ -4966,9 +4966,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] @@ -5002,8 +5002,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] @@ 
-5516,9 +5516,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] @@ -5551,8 +5551,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] @@ -6061,9 +6061,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] @@ -6096,8 +6096,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_cbranch_execz .LBB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 6efbd6ce87385..8082a0646d4a1 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ 
b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -2533,8 +2533,7 @@ define i1 @test124(i32 %arg1, i64 %arg2) { ; GCN-LABEL: test124: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], 0x3e8 -; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[1:2] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2] ; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e8, v0 ; GCN-NEXT: s_or_b32 s0, s0, vcc_lo ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index c703a1dd7734d..6997913f1ae16 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -250,8 +250,8 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa ; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm ; GCN-LABEL: {{^}}commute_ule_64_i64: -; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u64_e32 vcc, s[[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41 +; GCN: v_cmp_gt_u64_e32 vcc, [[K]], v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index bd0c2b30eb5de..53c9861b7a051 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -310,8 +310,7 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b64 s[4:5], 0xff -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: 
s_ashr_i64 s[0:1], s[0:1], s2 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 86d0df494bcac..d9e0ddd3b9044 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -27,29 +27,27 @@ define float @v_mul_42_f32(float %x) { } define double @v_mul_42_f64(double %x) { -; GFX9-LABEL: v_mul_42_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40450000 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_42_f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x40450000 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_42_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40450000 -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_42_f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40450000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_mul_42_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40450000 -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_42_f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_mul_f64 v[0:1], 0x40450000, v[0:1] +; GFX1011-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 42.0 ret double %mul } @@ -726,9 +724,9 @@ 
define double @v_mul_0x1pn1031_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn1031_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s5, 0x800 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x800 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1pn1031_f64: @@ -740,9 +738,7 @@ define double @v_mul_0x1pn1031_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn1031_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_movk_i32 s5, 0x800 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x800, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn1031_f64: @@ -754,9 +750,7 @@ define double @v_mul_0x1pn1031_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn1031_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_movk_i32 s1, 0x800 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x800, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 4.34584737989687770135e-311 ret double %mul @@ -774,9 +768,9 @@ define double @v_mul_0x1pn1022_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn1022_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x100000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x100000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: 
v_mul_0x1pn1022_f64: @@ -788,9 +782,7 @@ define double @v_mul_0x1pn1022_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn1022_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x100000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x100000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn1022_f64: @@ -802,9 +794,7 @@ define double @v_mul_0x1pn1022_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn1022_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x100000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x100000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 2.22507385850720138309e-308 ret double %mul @@ -822,9 +812,9 @@ define double @v_mul_0x1pn1021_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn1021_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x200000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x200000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1pn1021_f64: @@ -836,9 +826,7 @@ define double @v_mul_0x1pn1021_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn1021_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x200000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x200000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn1021_f64: 
@@ -850,9 +838,7 @@ define double @v_mul_0x1pn1021_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn1021_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x200000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x200000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 4.45014771701440276618e-308 ret double %mul @@ -870,9 +856,9 @@ define double @v_mul_0x1pn64_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn64_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x3bf00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3bf00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1pn64_f64: @@ -884,9 +870,7 @@ define double @v_mul_0x1pn64_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn64_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x3bf00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3bf00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn64_f64: @@ -898,9 +882,7 @@ define double @v_mul_0x1pn64_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn64_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x3bf00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3bf00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 5.42101086242752217004e-20 ret double %mul @@ 
-918,9 +900,9 @@ define double @v_mul_0x1pn17_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn17_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x3ee00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3ee00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1pn17_f64: @@ -932,9 +914,7 @@ define double @v_mul_0x1pn17_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn17_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x3ee00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3ee00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn17_f64: @@ -946,9 +926,7 @@ define double @v_mul_0x1pn17_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn17_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x3ee00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3ee00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 0.00000762939453125 ret double %mul @@ -965,9 +943,9 @@ define double @v_mul_0x1pn16_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn16_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x3ef00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3ef00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-SDAG-LABEL: v_mul_0x1pn16_f64: @@ -979,9 +957,7 @@ define double @v_mul_0x1pn16_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn16_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x3ef00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3ef00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn16_f64: @@ -993,9 +969,7 @@ define double @v_mul_0x1pn16_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn16_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x3ef00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3ef00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 0.0000152587890625 ret double %mul @@ -1012,9 +986,9 @@ define double @v_mul_0x1pn15_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1pn15_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0.5 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0.5 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1pn15_f64: @@ -1026,9 +1000,7 @@ define double @v_mul_0x1pn15_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1pn15_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0.5 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3f000000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1pn15_f64: @@ -1040,9 +1012,7 
@@ define double @v_mul_0x1pn15_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1pn15_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0.5 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3f000000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 0.000030517578125 ret double %mul @@ -1058,9 +1028,9 @@ define double @v_mul_neg256_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg256_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0700000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0700000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg256_f64: @@ -1072,9 +1042,7 @@ define double @v_mul_neg256_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg256_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0700000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0700000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg256_f64: @@ -1086,9 +1054,7 @@ define double @v_mul_neg256_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg256_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0700000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0700000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -256.0 ret double %mul @@ -1104,9 +1070,9 @@ define double 
@v_mul_neg128_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg128_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0600000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0600000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg128_f64: @@ -1118,9 +1084,7 @@ define double @v_mul_neg128_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg128_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0600000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0600000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg128_f64: @@ -1132,9 +1096,7 @@ define double @v_mul_neg128_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg128_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0600000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0600000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -128.0 ret double %mul @@ -1150,9 +1112,9 @@ define double @v_mul_neg64_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg64_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0500000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0500000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg64_f64: @@ -1164,9 
+1126,7 @@ define double @v_mul_neg64_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg64_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0500000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0500000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg64_f64: @@ -1178,9 +1138,7 @@ define double @v_mul_neg64_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg64_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0500000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0500000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -64.0 ret double %mul @@ -1196,9 +1154,9 @@ define double @v_mul_neg32_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg32_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0400000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0400000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg32_f64: @@ -1210,9 +1168,7 @@ define double @v_mul_neg32_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg32_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0400000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg32_f64: @@ -1224,9 +1180,7 @@ define double @v_mul_neg32_f64(double %x) { ; 
GFX11-GISEL-LABEL: v_mul_neg32_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0400000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -32.0 ret double %mul @@ -1242,9 +1196,9 @@ define double @v_mul_neg16_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg16_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0300000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0300000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg16_f64: @@ -1256,9 +1210,7 @@ define double @v_mul_neg16_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg16_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0300000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg16_f64: @@ -1270,9 +1222,7 @@ define double @v_mul_neg16_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg16_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0300000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -16.0 ret double %mul @@ -1288,9 +1238,9 @@ define double @v_mul_neg8_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg8_f64: ; GFX9-GISEL: ; %bb.0: ; 
GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0200000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0200000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg8_f64: @@ -1302,9 +1252,7 @@ define double @v_mul_neg8_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_neg8_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0200000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0200000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg8_f64: @@ -1316,9 +1264,7 @@ define double @v_mul_neg8_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg8_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0200000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0200000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -8.0 ret double %mul @@ -1414,9 +1360,9 @@ define double @v_mul_neg_quarter_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_neg_quarter_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xbfd00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbfd00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg_quarter_f64: @@ -1428,9 +1374,7 @@ define double @v_mul_neg_quarter_f64(double %x) { ; GFX10-GISEL-LABEL: 
v_mul_neg_quarter_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xbfd00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xbfd00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg_quarter_f64: @@ -1442,9 +1386,7 @@ define double @v_mul_neg_quarter_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_neg_quarter_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xbfd00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xbfd00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -0.25 ret double %mul @@ -1460,9 +1402,9 @@ define double @v_mul_quarter_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_quarter_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x3fd00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3fd00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_quarter_f64: @@ -1474,9 +1416,7 @@ define double @v_mul_quarter_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_quarter_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x3fd00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3fd00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_quarter_f64: @@ -1488,9 +1428,7 @@ define double @v_mul_quarter_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_quarter_f64: ; 
GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x3fd00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3fd00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 0.25 ret double %mul @@ -1575,9 +1513,9 @@ define double @v_mul_8_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_8_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40200000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_8_f64: @@ -1589,9 +1527,7 @@ define double @v_mul_8_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_8_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40200000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40200000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_8_f64: @@ -1603,9 +1539,7 @@ define double @v_mul_8_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_8_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40200000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40200000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 8.0 ret double %mul @@ -1621,9 +1555,9 @@ define double @v_mul_16_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_16_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: 
s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40300000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_16_f64: @@ -1635,9 +1569,7 @@ define double @v_mul_16_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_16_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_16_f64: @@ -1649,9 +1581,7 @@ define double @v_mul_16_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_16_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 16.0 ret double %mul @@ -1667,9 +1597,9 @@ define double @v_mul_32_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_32_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_32_f64: @@ -1681,9 +1611,7 @@ define double @v_mul_32_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_32_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_32_f64: @@ -1695,9 +1623,7 @@ define double @v_mul_32_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_32_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 32.0 ret double %mul @@ -1713,9 +1639,9 @@ define double @v_mul_64_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_64_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40500000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40500000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_64_f64: @@ -1727,9 +1653,7 @@ define double @v_mul_64_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_64_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40500000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40500000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_64_f64: @@ -1741,9 +1665,7 @@ define double @v_mul_64_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_64_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40500000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 
s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40500000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 64.0 ret double %mul @@ -1759,9 +1681,9 @@ define double @v_mul_128_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_128_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40600000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40600000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_128_f64: @@ -1773,9 +1695,7 @@ define double @v_mul_128_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_128_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40600000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40600000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_128_f64: @@ -1787,9 +1707,7 @@ define double @v_mul_128_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_128_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40600000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40600000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 128.0 ret double %mul @@ -1805,9 +1723,9 @@ define double @v_mul_256_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_256_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40700000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 
0x40700000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_256_f64: @@ -1819,9 +1737,7 @@ define double @v_mul_256_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_256_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40700000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40700000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_256_f64: @@ -1833,9 +1749,7 @@ define double @v_mul_256_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_256_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40700000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40700000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 256.0 ret double %mul @@ -1852,9 +1766,9 @@ define double @v_mul_0x1p63_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p63_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x43e00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x43e00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p63_f64: @@ -1866,9 +1780,7 @@ define double @v_mul_0x1p63_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p63_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x43e00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x43e00000, v[0:1] ; 
GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p63_f64: @@ -1880,9 +1792,7 @@ define double @v_mul_0x1p63_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p63_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x43e00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x43e00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 9223372036854775808.0 ret double %mul @@ -1899,9 +1809,9 @@ define double @v_mul_0x1p64_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p64_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x43f00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x43f00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p64_f64: @@ -1913,9 +1823,7 @@ define double @v_mul_0x1p64_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p64_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x43f00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x43f00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p64_f64: @@ -1927,9 +1835,7 @@ define double @v_mul_0x1p64_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p64_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x43f00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x43f00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 
s[30:31] %mul = fmul double %x, 18446744073709551616.0 ret double %mul @@ -1947,9 +1853,9 @@ define double @v_mul_0x1p65_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p65_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_brev_b32 s5, 34 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v3, 34 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p65_f64: @@ -1961,9 +1867,7 @@ define double @v_mul_0x1p65_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p65_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_brev_b32 s5, 34 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x44000000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p65_f64: @@ -1975,9 +1879,7 @@ define double @v_mul_0x1p65_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p65_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_brev_b32 s1, 34 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x44000000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 36893488147419103232.0 ret double %mul @@ -1994,10 +1896,8 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) { ; ; GFX9-GISEL-LABEL: s_mul_0x1p65_f64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX9-GISEL-NEXT: s_brev_b32 s3, 34 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v1, 34 ; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] 
; GFX9-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-GISEL-NEXT: v_readfirstlane_b32 s1, v1 @@ -2012,9 +1912,7 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) { ; ; GFX10-GISEL-LABEL: s_mul_0x1p65_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX10-GISEL-NEXT: s_brev_b32 s3, 34 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], s[2:3] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x44000000, s[0:1] ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: ; return to shader part epilog @@ -2028,9 +1926,7 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) { ; ; GFX11-GISEL-LABEL: s_mul_0x1p65_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX11-GISEL-NEXT: s_brev_b32 s3, 34 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x44000000, s[0:1] ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-GISEL-NEXT: ; return to shader part epilog @@ -2057,9 +1953,9 @@ define double @v_mul_0x1p128_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p128_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x47f00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x47f00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p128_f64: @@ -2071,9 +1967,7 @@ define double @v_mul_0x1p128_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p128_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x47f00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 
v[0:1], 0x47f00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p128_f64: @@ -2085,9 +1979,7 @@ define double @v_mul_0x1p128_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p128_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x47f00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x47f00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 3.40282366920938463463e+38 ret double %mul @@ -2105,9 +1997,9 @@ define double @v_mul_0x1p1022_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p1022_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x7fd00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fd00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p1022_f64: @@ -2119,9 +2011,7 @@ define double @v_mul_0x1p1022_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p1022_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x7fd00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x7fd00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p1022_f64: @@ -2133,9 +2023,7 @@ define double @v_mul_0x1p1022_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p1022_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x7fd00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 
0x7fd00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 4.49423283715578976932e+307 ret double %mul @@ -2153,9 +2041,9 @@ define double @v_mul_0x1p1023_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_0x1p1023_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x7fe00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fe00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_0x1p1023_f64: @@ -2167,9 +2055,7 @@ define double @v_mul_0x1p1023_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_0x1p1023_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x7fe00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x7fe00000, v[0:1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_0x1p1023_f64: @@ -2181,9 +2067,7 @@ define double @v_mul_0x1p1023_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_0x1p1023_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x7fe00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x7fe00000, v[0:1] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 8.98846567431157953865e+307 ret double %mul @@ -2191,29 +2075,27 @@ define double @v_mul_0x1p1023_f64(double %x) { ; Check that this doesn't interfer with fma formation define double @v_fma_mul_add_32_f64(double %x, double %y) { -; GFX9-LABEL: v_fma_mul_add_32_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 
0x40400000 -; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_fma_mul_add_32_f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fma_mul_add_32_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_fma_mul_add_32_f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_mul_add_32_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], s[0:1], v[2:3] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_fma_mul_add_32_f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[2:3] +; GFX1011-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract double %x, 32.0 %fma = fadd contract double %mul, %y ret double %fma @@ -2229,23 +2111,33 @@ define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) { ; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fma_mul_add_32_v2f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-NEXT: v_fma_f64 v[0:1], 
v[0:1], s[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_mul_add_32_v2f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], s[0:1], v[4:5] -; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], v[6:7] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5] +; GFX10-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7] +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5] +; GFX11-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract <2 x double> %x, %fma = fadd contract <2 x double> %mul, %y ret <2 x double> %fma @@ -2319,9 +2211,9 @@ define double @v_mul_add_32_f64(double %x, double %y) { ; GFX9-GISEL-LABEL: v_mul_add_32_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 
s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] ; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2335,9 +2227,7 @@ define double @v_mul_add_32_f64(double %x, double %y) { ; GFX10-GISEL-LABEL: v_mul_add_32_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, v[0:1] ; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2351,9 +2241,7 @@ define double @v_mul_add_32_f64(double %x, double %y) { ; GFX11-GISEL-LABEL: v_mul_add_32_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, v[0:1] ; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, 32.0 @@ -2456,58 +2344,54 @@ define double @v_mul_add_4_f64(double %x, double %y) { } define double @v_fma_mul_sub_32_f64(double %x, double %y) { -; GFX9-LABEL: v_fma_mul_sub_32_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_fma_mul_sub_32_f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], 
v[0:1], s[4:5], -v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fma_mul_sub_32_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_fma_mul_sub_32_f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], -v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_mul_sub_32_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], s[0:1], -v[2:3] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_fma_mul_sub_32_f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], -v[2:3] +; GFX1011-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract double %x, 32.0 %fma = fsub contract double %mul, %y ret double %fma } define double @v_fma_mul_add_neg32_f64(double %x, double %y) { -; GFX9-LABEL: v_fma_mul_add_neg32_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0xc0400000 -; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_fma_mul_add_neg32_f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0xc0400000 +; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fma_mul_add_neg32_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0xc0400000 -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_fma_mul_add_neg32_f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0400000 +; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_mul_add_neg32_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0xc0400000 -; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], s[0:1], v[2:3] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_fma_mul_add_neg32_f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_fma_f64 v[0:1], 0xc0400000, v[0:1], v[2:3] +; GFX1011-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract double %x, -32.0 %fma = fadd contract double %mul, %y ret double %fma @@ -2523,9 +2407,9 @@ define double @v_mul_fabs_32_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_32_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_32_f64: @@ -2537,9 +2421,7 @@ define double @v_mul_fabs_32_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_32_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 
v[0:1], 0x40400000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_32_f64: @@ -2551,9 +2433,7 @@ define double @v_mul_fabs_32_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_32_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %mul = fmul double %x.fabs, 32.0 @@ -2561,29 +2441,27 @@ define double @v_mul_fabs_32_f64(double %x) { } define double @v_mul_add_fma_fabs_32_f64(double %x, double %y) { -; GFX9-LABEL: v_mul_add_fma_fabs_32_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-NEXT: v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_add_fma_fabs_32_f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_add_fma_fabs_32_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-NEXT: v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_add_fma_fabs_32_f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], |v[0:1]|, v[4:5], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_mul_add_fma_fabs_32_f64: 
-; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-NEXT: v_fma_f64 v[0:1], |v[0:1]|, s[0:1], v[2:3] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_add_fma_fabs_32_f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_fma_f64 v[0:1], 0x40400000, |v[0:1]|, v[2:3] +; GFX1011-NEXT: s_setpc_b64 s[30:31] %x.fabs = call double @llvm.fabs.f64(double %x) %mul = fmul contract double %x.fabs, 32.0 %fma = fadd contract double %mul, %y @@ -2617,10 +2495,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0x40300000 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_16_v2f64: @@ -2633,10 +2509,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0x40300000 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x double> %x, ret <2 x double> %mul @@ -2669,10 +2543,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_neg16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: 
s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0xc0300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0xc0300000 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg16_v2f64: @@ -2685,10 +2557,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_neg16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0xc0300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0xc0300000 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x double> %x, ret <2 x double> %mul @@ -2721,10 +2591,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, 0x40300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, 0x40300000 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f64: @@ -2737,10 +2605,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, s[0:1] +; 
GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, 0x40300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, 0x40300000 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x) %mul = fmul <2 x double> %x.fabs, @@ -2757,10 +2623,8 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) { ; ; GFX9-GISEL-LABEL: s_mul_32_f64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] ; GFX9-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-GISEL-NEXT: v_readfirstlane_b32 s1, v1 @@ -2775,9 +2639,7 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) { ; ; GFX10-GISEL-LABEL: s_mul_32_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], s[2:3] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, s[0:1] ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: ; return to shader part epilog @@ -2791,9 +2653,7 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) { ; ; GFX11-GISEL-LABEL: s_mul_32_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40400000, s[0:1] ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-GISEL-NEXT: ; return to shader part epilog @@ -6885,9 +6745,9 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 { ; GFX9-GISEL-LABEL: v_constrained_fmul_32_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constrained_fmul_32_f64: @@ -6899,9 +6759,7 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 { ; GFX10-GISEL-LABEL: v_constrained_fmul_32_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40400000 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_constrained_fmul_32_f64: @@ -6913,9 +6771,7 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 { ; GFX11-GISEL-LABEL: v_constrained_fmul_32_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40400000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40400000 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict") ret double %val @@ -6931,9 +6787,9 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 { ; GFX9-GISEL-LABEL: v_constrained_fmul_0x1p64_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x43f00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x43f00000 +; GFX9-GISEL-NEXT: v_mul_f64 
v[0:1], v[0:1], v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constrained_fmul_0x1p64_f64: @@ -6945,9 +6801,7 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 { ; GFX10-GISEL-LABEL: v_constrained_fmul_0x1p64_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x43f00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x43f00000 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_constrained_fmul_0x1p64_f64: @@ -6959,9 +6813,7 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 { ; GFX11-GISEL-LABEL: v_constrained_fmul_0x1p64_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x43f00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x43f00000 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 18446744073709551616.0, metadata !"round.dynamic", metadata !"fpexcept.strict") ret double %val @@ -6988,9 +6840,9 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s5, 0x800 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x800 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64: @@ -7002,9 +6854,7 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64: ; GFX10-GISEL: ; %bb.0: ; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_movk_i32 s5, 0x800 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x800, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64: @@ -7016,9 +6866,7 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_movk_i32 s1, 0x800 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x800, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, 4.34584737989687770135e-311 @@ -7035,9 +6883,9 @@ define double @v_mul_fabs_neg256_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_neg256_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0700000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0700000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_neg256_f64: @@ -7049,9 +6897,7 @@ define double @v_mul_fabs_neg256_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_neg256_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0700000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0700000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_neg256_f64: @@ -7063,9 +6909,7 @@ define double @v_mul_fabs_neg256_f64(double %x) { ; 
GFX11-GISEL-LABEL: v_mul_fabs_neg256_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0700000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0700000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, -256.0 @@ -7082,9 +6926,9 @@ define double @v_mul_fabs_neg8_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_neg8_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0200000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0200000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_neg8_f64: @@ -7096,9 +6940,7 @@ define double @v_mul_fabs_neg8_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_neg8_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xc0200000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0200000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_neg8_f64: @@ -7110,9 +6952,7 @@ define double @v_mul_fabs_neg8_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_neg8_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xc0200000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0200000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, 
-8.0 @@ -7203,9 +7043,9 @@ define double @v_mul_fabs_negquarter_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_negquarter_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xbfd00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbfd00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_negquarter_f64: @@ -7217,9 +7057,7 @@ define double @v_mul_fabs_negquarter_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_negquarter_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0xbfd00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_negquarter_f64: @@ -7231,9 +7069,7 @@ define double @v_mul_fabs_negquarter_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_negquarter_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0xbfd00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, -0.25 @@ -7250,9 +7086,9 @@ define double @v_mul_fabs_quarter_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_quarter_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x3fd00000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3fd00000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_quarter_f64: @@ -7264,9 +7100,7 @@ define double @v_mul_fabs_quarter_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_quarter_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0x3fd00000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_quarter_f64: @@ -7278,9 +7112,7 @@ define double @v_mul_fabs_quarter_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_quarter_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x3fd00000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, 0.25 @@ -7371,9 +7203,9 @@ define double @v_mul_fabs_8_f64(double %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_8_f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x40200000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[2:3] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_8_f64: @@ -7385,9 +7217,7 @@ define double @v_mul_fabs_8_f64(double %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_8_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX10-GISEL-NEXT: 
s_mov_b32 s5, 0x40200000 -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40200000, |v[0:1]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_8_f64: @@ -7399,9 +7229,7 @@ define double @v_mul_fabs_8_f64(double %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_8_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0x40200000 -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40200000, |v[0:1]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, 8.0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index d4c830c55030d..b60780db77378 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -435,21 +435,17 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40220000 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40220000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 
v[3:4], v0 @@ -457,7 +453,7 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double @@ -529,21 +525,17 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0xc0220000 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0xc0220000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 @@ -551,7 +543,7 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double @@ -721,8 +713,7 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 -; 
GFX10-NEXT: s_mov_b64 s[4:5], 0x2000 -; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 @@ -740,23 +731,22 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 -; GFX11-NEXT: s_mov_b64 s[0:1], 0x2000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl8 = shl nuw i64 8, %cnt @@ -838,24 +828,20 @@ define double @fmul_pow_mul_max_pow2(i16 
%cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40080000 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_mul_max_pow2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40080000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt @@ -925,21 +911,17 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40220000 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40220000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 @@ -947,7 +929,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i64 %v, %cnt %conv = uitofp i64 %shl to double @@ -1206,8 +1188,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x402e0000 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 @@ -1216,8 +1196,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec: @@ -1225,11 +1205,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x402e0000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -1239,8 +1218,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -1435,20 +1414,21 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 -; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 2 -; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 -; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1 +; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2 ; VI-NEXT: s_mov_b32 s5, 0x402e0000 -; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 -; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 -; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2 +; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2 +; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32 +; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32 +; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0 +; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] +; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1] +; VI-NEXT: v_mul_f64 v[0:1], v[2:3], s[4:5] ; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] -; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8] -; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_mov_b32 s5, 0x402c0000 -; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; 
VI-NEXT: v_mul_f64 v[2:3], v[4:5], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: @@ -1456,8 +1436,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2 -; GFX10-NEXT: s_mov_b32 s5, 0x402e0000 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 @@ -1466,9 +1444,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-NEXT: s_mov_b32 s5, 0x402c0000 -; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX10-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: @@ -1476,11 +1453,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 -; GFX11-NEXT: s_mov_b32 s1, 0x402e0000 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -1489,11 +1465,9 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: s_mov_b32 s1, 0x402c0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX11-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -1580,8 +1554,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x402e0000 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 @@ -1590,8 +1562,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: @@ -1599,11 +1571,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 
s1, 0x402e0000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -1613,8 +1584,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], s[0:1] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] +; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> diff --git a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir index 6e975c8a53707..69c6a858162f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir @@ -84,6 +84,9 @@ body: | SI_RETURN_TO_EPILOG %2 ... +# FIXME: This could be folded, but we do not know if operand of S_AND_B64 is signed or unsigned +# and if it will be sign or zero extended. 
+ --- name: fold_uint_32bit_literal_sgpr tracksRegLiveness: true @@ -92,7 +95,8 @@ body: | ; GCN-LABEL: name: fold_uint_32bit_literal_sgpr ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 4294967295, implicit-def $scc + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4294967295 + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], [[S_MOV_B64_]], implicit-def $scc ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]] %0:sreg_64 = IMPLICIT_DEF %1:sreg_64 = S_MOV_B64 4294967295 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 58b0a0f56918b..fdc8c908ded54 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1561,12 +1561,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -1579,12 +1579,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm @@ -1592,12 +1592,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm @@ -1637,12 +1637,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -1838,12 +1838,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; 
GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 055cfbdcc1ea3..3a0b8259d0849 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1821,14 +1821,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX6-NEXT: v_add_f64 v[4:5], v[0:1], -v[4:5] ; GFX6-NEXT: s_mov_b32 s9, 0x3fefffff ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], -v[4:5] -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] +; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc ; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9] +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 @@ -1841,13 +1841,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX7-LABEL: safe_math_fract_f64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_fract_f64_e32 v[4:5], v[0:1] ; GFX7-NEXT: v_cmp_neq_f64_e64 
vcc, |v[0:1]|, s[4:5] ; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc @@ -1872,10 +1873,8 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX11-LABEL: safe_math_fract_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x7ff00000 ; GFX11-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, |v[0:1]|, s[0:1] +; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| ; GFX11-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 8bb8f6c464cd0..196a3705ac818 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s define double @v_sqrt_f64(double %x) { ; SDAG-LABEL: v_sqrt_f64: @@ -37,11 +37,11 @@ define double @v_sqrt_f64(double %x) { ; GISEL-LABEL: v_sqrt_f64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -97,11 +97,11 @@ define double @v_sqrt_f64_fneg(double %x) { ; GISEL-LABEL: v_sqrt_f64_fneg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -158,11 +158,11 @@ define double @v_sqrt_f64_fabs(double %x) { ; GISEL-LABEL: v_sqrt_f64_fabs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; 
GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -219,11 +219,11 @@ define double @v_sqrt_f64_fneg_fabs(double %x) { ; GISEL-LABEL: v_sqrt_f64_fneg_fabs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -281,11 +281,11 @@ define double @v_sqrt_f64_ninf(double %x) { ; GISEL-LABEL: v_sqrt_f64_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -341,11 +341,11 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute: ; GISEL: ; 
%bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -401,11 +401,11 @@ define double @v_sqrt_f64_nnan(double %x) { ; GISEL-LABEL: v_sqrt_f64_nnan: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -461,10 +461,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) { ; ; GISEL-LABEL: s_sqrt_f64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s2, 0 -; GISEL-NEXT: s_brev_b32 s3, 8 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -532,10 +530,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { ; ; GISEL-LABEL: s_sqrt_f64_ninf: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s2, 0 -; GISEL-NEXT: s_brev_b32 s3, 8 -; 
GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -603,10 +599,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { ; ; GISEL-LABEL: s_sqrt_f64_afn: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s2, 0 -; GISEL-NEXT: s_brev_b32 s3, 8 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -674,10 +668,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { ; ; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s2, 0 -; GISEL-NEXT: s_brev_b32 s3, 8 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -745,11 +737,11 @@ define double @v_sqrt_f64_nsz(double %x) { ; GISEL-LABEL: v_sqrt_f64_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -805,11 +797,11 @@ define double 
@v_sqrt_f64_nnan_ninf(double %x) { ; GISEL-LABEL: v_sqrt_f64_nnan_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -865,11 +857,11 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -925,11 +917,11 @@ define double @v_sqrt_f64_afn(double %x) { ; GISEL-LABEL: v_sqrt_f64_afn: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, 
vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -985,11 +977,11 @@ define double @v_sqrt_f64_afn_nsz(double %x) { ; GISEL-LABEL: v_sqrt_f64_afn_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1066,12 +1058,14 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1142,11 +1136,11 @@ define double @v_sqrt_f64_afn_nnan(double %x) { ; GISEL-LABEL: v_sqrt_f64_afn_nnan: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; 
GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1202,11 +1196,11 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) { ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1263,11 +1257,11 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) { ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1344,12 
+1338,14 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1420,11 +1416,11 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1480,11 +1476,11 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; 
GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1540,11 +1536,11 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1600,11 +1596,11 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 { ; GISEL-LABEL: v_sqrt_f64__unsafe_attr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1681,12 +1677,14 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; 
GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1795,17 +1793,19 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GISEL-LABEL: v_sqrt_v3f64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_brev_b32 s7, 8 -; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] -; GISEL-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v6, 0x100 -; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[6:7] -; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v7 +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_brev_b32 s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] +; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] +; GISEL-NEXT: v_mov_b32_e32 v8, 0x100 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 
v8, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v9 ; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 ; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] @@ -1870,5 +1870,3 @@ attributes #1 = { convergent nounwind willreturn memory(none) } attributes #2 = { "approx-func-fp-math"="true" } attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" } attributes #4 = { "unsafe-fp-math"="true" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 08e06d4dd015a..d85778bc0195f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -53,16 +53,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm @@ -5403,12 +5403,12 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v0, 
0xfffffe00 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -6148,12 +6148,12 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 glc @@ -6339,12 +6339,12 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc @@ -6606,16 +6606,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; 
SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm @@ -7001,16 +7001,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 429bdd805ec5e..4cbd5e84871cc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -1096,8 +1096,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_mov_b64 
s[4:5], 0 @@ -1141,27 +1141,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 ; GFX9-NEXT: .LBB2_3: @@ -1169,25 +1170,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; 
GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1226,9 +1225,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1255,8 +1253,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: 
s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -1266,10 +1263,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1309,10 +1305,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -1350,27 +1345,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 
s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: @@ -1378,25 +1374,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1435,9 +1429,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1464,8 +1457,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -1475,10 +1467,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; 
GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1518,10 +1509,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -2356,8 +2346,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2401,27 +2391,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: 
v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 ; GFX9-NEXT: .LBB4_3: @@ -2429,25 +2420,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2486,9 +2475,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -2515,8 +2503,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -2526,10 +2513,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt 
vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2569,10 +2555,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -2610,27 +2595,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB4_2: 
; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: @@ -2638,25 +2624,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; 
GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2695,9 +2679,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -2724,8 +2707,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -2735,10 +2717,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2778,10 
+2759,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -4320,8 +4300,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -4365,27 +4345,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; 
GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-NEXT: .LBB7_3: @@ -4393,25 +4374,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 
v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4450,9 +4429,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -4479,8 +4457,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -4490,10 +4467,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4533,10 +4509,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3 ; 
GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -4574,27 +4549,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, 
s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: @@ -4602,25 +4578,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4659,9 +4633,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; 
GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -4688,8 +4661,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -4699,10 +4671,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4742,10 +4713,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; 
GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index f05a420a1b0a2..4a00d7bc71bca 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -1192,8 +1192,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1237,27 +1237,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 ; GFX9-NEXT: .LBB2_3: @@ -1265,25 +1266,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword 
s2, s[0:1], 0x0 @@ -1322,9 +1321,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1351,8 +1349,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -1362,10 +1359,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1405,10 +1401,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 
v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -1446,27 +1441,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; 
GFX9-DPP-NEXT: .LBB2_3: @@ -1474,25 +1470,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1531,9 +1525,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; 
GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1560,8 +1553,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -1571,10 +1563,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1614,10 +1605,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 
s3, s[0:1], 0x0 @@ -2452,8 +2442,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2497,27 +2487,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: 
v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 ; GFX9-NEXT: .LBB4_3: @@ -2525,25 +2516,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2582,9 +2571,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -2611,8 +2599,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -2622,10 +2609,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2665,10 +2651,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -2706,27 +2691,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; 
GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: @@ -2734,25 +2720,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 
; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2791,9 +2775,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -2820,8 +2803,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 
s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -2831,10 +2813,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2874,10 +2855,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -4504,8 +4484,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; 
GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -4549,27 +4529,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-NEXT: .LBB7_3: @@ -4577,25 +4558,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 
v0, s4, 0 ; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4634,9 +4613,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -4663,8 +4641,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; 
GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -4674,10 +4651,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4717,10 +4693,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -4758,27 +4733,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: @@ -4786,25 +4762,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4843,9 +4817,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -4872,8 +4845,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 @@ -4883,10 +4855,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; 
GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4926,10 +4897,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll index 173e4f656c90d..a290dd5fd145c 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -205,9 +205,8 @@ entry: ; FIXME: Should not have intermediate sgprs ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr: -; CHECK: s_mov_b64 s[0:1], 0x1e240 -; CHECK: v_mov_b32_e32 v0, s0 -; CHECK: v_mov_b32_e32 v1, s1 +; CHECK: v_mov_b32_e32 v0, 0x1e240 +; CHECK: v_mov_b32_e32 v1, 0 ; CHECK: use v[0:1] define amdgpu_kernel void @i64_imm_input_phys_vgpr() { entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index f5d41b246b1b8..220ea962b9e1d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -85,17 +85,18 @@ define 
amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 ; GFX11-NEXT: s_mov_b32 s13, s14 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s21, s14 +; GFX11-NEXT: s_mov_b32 s3, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_mov_b32 s14, s21 +; GFX11-NEXT: s_mov_b32 s14, s3 ; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_vccz .LBB2_4 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 ; GFX11-NEXT: .LBB2_3: ; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 ; GFX11-NEXT: .LBB2_4: ; %bb16 ; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54 ; GFX11-NEXT: s_bitcmp1_b32 s23, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index ae470efc92fee..3bc503e3714fe 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -289,16 +289,15 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s7, s[0:1], 0x34 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[4:5], 0xffff -; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s7, s7, 4 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s7 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_lshl_b32 s6, s6, 4 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], 
s[6:7], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -419,16 +418,15 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s7, s[0:1], 0x34 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[4:5], 0xffff -; GCN-NEXT: s_mov_b32 s6, 0x10001 +; GCN-NEXT: s_mov_b32 s4, 0x10001 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s7, s7, 4 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s7 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_lshl_b32 s6, s6, 4 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -445,12 +443,11 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[4:5], 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 3 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 ; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index c58dbd6bd1206..68427e8937bb9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ 
-1550,10 +1550,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 4 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b32 s8, s8, 4 -; SI-NEXT: s_mov_b64 s[0:1], 0xffff -; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 ; SI-NEXT: s_and_b32 s9, s1, 0x50005 ; SI-NEXT: s_and_b32 s8, s0, 0x50005 ; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1] @@ -1572,11 +1571,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b32 s8, s8, 4 -; VI-NEXT: s_mov_b64 s[0:1], 0xffff -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; VI-NEXT: s_lshl_b32 s0, s8, 4 ; VI-NEXT: s_mov_b32 s8, 0x50005 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 ; VI-NEXT: s_mov_b32 s9, s8 ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] @@ -1725,17 +1723,16 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 3 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SI-NEXT: s_lshl_b32 s8, s8, 3 -; SI-NEXT: s_mov_b64 s[2:3], 0xff -; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_and_b32 s9, s3, 0x5050505 -; SI-NEXT: s_and_b32 s8, s2, 0x5050505 +; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; SI-NEXT: s_and_b32 s9, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: s_and_b32 s8, s0, 0x5050505 +; SI-NEXT: s_or_b64 
s[0:1], s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1748,17 +1745,16 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 3 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; VI-NEXT: s_lshl_b32 s8, s8, 3 -; VI-NEXT: s_mov_b64 s[2:3], 0xff -; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; VI-NEXT: s_and_b32 s9, s3, 0x5050505 -; VI-NEXT: s_and_b32 s8, s2, 0x5050505 +; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; VI-NEXT: s_and_b32 s9, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; VI-NEXT: s_and_b32 s8, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index f98b41ba199bd..47f7943e076a4 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2064,19 +2064,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX11-NEXT: s_mov_b64 s[0:1], 0xffff +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_b64 v[0:1], v4, s[6:7] +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2106,12 +2104,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s7, 4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_lshl_b32 s2, s7, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 @@ -2128,14 +2125,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_lshl_b32 s1, s5, 4 -; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_lshl_b32 s3, s5, 4 +; VI-NEXT: s_or_b32 s2, s2, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 -; VI-NEXT: s_or_b32 s2, s4, s5 +; VI-NEXT: 
s_lshl_b64 s[0:1], 0xffff, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2155,14 +2151,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s6, s4, 0xffff -; CI-NEXT: s_lshl_b32 s1, s5, 4 -; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: s_lshl_b32 s2, s4, 16 +; CI-NEXT: s_lshl_b32 s3, s5, 4 +; CI-NEXT: s_or_b32 s2, s1, s2 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 -; CI-NEXT: s_or_b32 s2, s6, s4 +; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2177,15 +2172,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 -; GFX11-NEXT: s_mov_b64 s[2:3], 0xffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s0, s0 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s0 +; GFX11-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, s1, s6, v1 -; GFX11-NEXT: v_bfi_b32 v0, s0, s6, v0 +; GFX11-NEXT: v_bfi_b32 v1, s1, s2, v1 +; GFX11-NEXT: v_bfi_b32 v0, s0, s2, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll index 
741164bc04506..29a96c227f2f0 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll @@ -29,7 +29,7 @@ declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1 define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %stack, <4 x i32> %node, ptr %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, ptr addrspace(1) %arrayidx.i.i2202, ptr addrspace(1) %retval.0.i.i22089, ptr addrspace(1) %retval.1.i221310, i1 %cmp575, ptr addrspace(1) %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 { ; GCN-LABEL: {{^}}svm_node_closure_bsdf: ; GCN-NOT: v_writelane_b32 -; GCN: s_movk_i32 s28, 0x60 +; GCN: s_movk_i32 s26, 0x60 ; GCN-NOT: s31 ; GCN-NOT: v_readlane_b32 ; GCN: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll index dee5b724934a0..31295f2a543f2 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -674,98 +674,91 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) % define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; SI-LABEL: lds_ds_fmin_f64: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s3 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: 
s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 4 -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_mov_b32 s3, 0x40450000 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_lshl_b32 s3, s2, 4 +; SI-NEXT: s_lshl_b32 s2, s2, 3 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v1, 0x40450000 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: s_add_i32 s2, s5, 64 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: ds_min_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_add_i32 s1, s3, 64 +; SI-NEXT: v_mov_b32_e32 v5, s1 +; SI-NEXT: ds_min_f64 v5, v[0:1] ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] ; SI-NEXT: s_add_i32 s1, s0, 4 -; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen +; SI-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fmin_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s10, -1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7-NEXT: s_add_u32 s8, s8, s3 -; GFX7-NEXT: s_mov_b32 
s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s2, s4, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_lshl_b32 s3, s2, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s4, 4 -; GFX7-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: ds_min_f64 v5, v[0:1] offset:64 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] ; GFX7-NEXT: s_add_i32 s1, s0, 4 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_endpgm ; ; VI-LABEL: lds_ds_fmin_f64: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_mov_b32 s3, 0x40450000 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: 
v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 3 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshl_b32 s3, s2, 3 +; VI-NEXT: v_mov_b32_e32 v1, 0x40450000 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: s_lshl_b32 s2, s4, 4 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 4 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: ds_min_f64 v5, v[0:1] offset:64 ; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] ; VI-NEXT: s_add_i32 s1, s0, 4 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -783,11 +776,9 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX9-NEXT: s_add_u32 s8, s8, s3 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -814,17 +805,15 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s5, s4, 3 +; GFX10-NEXT: 
s_lshl_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 ; GFX10-NEXT: ds_min_f64 v4, v[0:1] offset:64 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -838,16 +827,14 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX11-LABEL: lds_ds_fmin_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40450000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s5, s4, 3 -; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: s_lshl_b32 s2, s4, 4 +; GFX11-NEXT: s_lshl_b32 s3, s2, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s2 ; GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -870,11 +857,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; G_SI-NEXT: s_mov_b32 s2, 0 ; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s2 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s4, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 ; G_SI-NEXT: s_lshl_b32 s2, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ 
-904,11 +891,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX7-NEXT: s_mov_b32 s2, 0 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: s_add_i32 s4, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -938,11 +925,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; G_VI-NEXT: s_mov_b32 s2, 0 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_mov_b32 s3, 0x40450000 -; G_VI-NEXT: v_mov_b32_e32 v0, s2 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: s_add_i32 s4, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 +; G_VI-NEXT: v_mov_b32_e32 v0, s2 ; G_VI-NEXT: s_lshl_b32 s2, s4, 3 +; G_VI-NEXT: v_mov_b32_e32 v1, s3 ; G_VI-NEXT: v_mov_b32_e32 v2, s2 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -972,11 +959,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 ; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 @@ -1031,10 +1018,10 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: s_add_i32 s4, s2, 4 ; G_GFX11-NEXT: s_mov_b32 s2, 0 -; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 ; G_GFX11-NEXT: s_mov_b32 
s3, 0x40450000 +; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 ; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1 -; G_GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3 +; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5 ; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4 ; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; G_GFX11-NEXT: v_mov_b32_e32 v4, s2 @@ -1060,98 +1047,91 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace( define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; SI-LABEL: lds_ds_fmax_f64: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s3 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 4 -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_mov_b32 s3, 0x40450000 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_lshl_b32 s3, s2, 4 +; SI-NEXT: s_lshl_b32 s2, s2, 3 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v1, 0x40450000 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: s_add_i32 s2, s5, 64 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: ds_max_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_add_i32 s1, s3, 64 +; SI-NEXT: v_mov_b32_e32 v5, s1 
+; SI-NEXT: ds_max_f64 v5, v[0:1] ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] ; SI-NEXT: s_add_i32 s1, s0, 4 -; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen +; SI-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fmax_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_mov_b32 s10, -1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7-NEXT: s_add_u32 s8, s8, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s2, s4, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_lshl_b32 s3, s2, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s4, 4 -; GFX7-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: 
v_mov_b32_e32 v4, s1 +; GFX7-NEXT: ds_max_f64 v5, v[0:1] offset:64 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] ; GFX7-NEXT: s_add_i32 s1, s0, 4 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_endpgm ; ; VI-LABEL: lds_ds_fmax_f64: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_mov_b32 s3, 0x40450000 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 3 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshl_b32 s3, s2, 3 +; VI-NEXT: v_mov_b32_e32 v1, 0x40450000 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: s_lshl_b32 s2, s4, 4 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 4 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: ds_max_f64 v5, v[0:1] offset:64 ; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] ; VI-NEXT: s_add_i32 s1, s0, 4 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1169,11 +1149,9 @@ 
define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX9-NEXT: s_add_u32 s8, s8, s3 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -1200,17 +1178,15 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s5, s4, 3 +; GFX10-NEXT: s_lshl_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 ; GFX10-NEXT: ds_max_f64 v4, v[0:1] offset:64 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -1224,16 +1200,14 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; GFX11-LABEL: lds_ds_fmax_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40450000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s5, s4, 3 -; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: s_lshl_b32 s2, s4, 4 +; GFX11-NEXT: s_lshl_b32 s3, s2, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s2 ; GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -1256,11 +1230,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; G_SI-NEXT: s_mov_b32 s2, 0 ; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s2 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s4, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 ; G_SI-NEXT: s_lshl_b32 s2, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1290,11 +1264,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX7-NEXT: s_mov_b32 s2, 0 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: s_add_i32 s4, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 ; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1324,11 +1298,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; G_VI-NEXT: s_mov_b32 s2, 0 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 ; G_VI-NEXT: s_mov_b32 s3, 0x40450000 -; G_VI-NEXT: v_mov_b32_e32 v0, s2 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; 
G_VI-NEXT: s_add_i32 s4, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 +; G_VI-NEXT: v_mov_b32_e32 v0, s2 ; G_VI-NEXT: s_lshl_b32 s2, s4, 3 +; G_VI-NEXT: v_mov_b32_e32 v1, s3 ; G_VI-NEXT: v_mov_b32_e32 v2, s2 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1358,11 +1332,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 ; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 @@ -1417,10 +1391,10 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace( ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: s_add_i32 s4, s2, 4 ; G_GFX11-NEXT: s_mov_b32 s2, 0 -; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 ; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000 +; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 ; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1 -; G_GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3 +; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5 ; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4 ; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; G_GFX11-NEXT: v_mov_b32_e32 v4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 5a950d803e9c5..af59d62b2e2d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GFX11,SDAG-GFX11 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,SDAG-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GISEL-GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX10 %s declare i32 @llvm.amdgcn.fcmp.f32(float, float, i32) #0 declare i32 @llvm.amdgcn.fcmp.f64(double, double, i32) #0 @@ -962,10 +962,8 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -976,11 +974,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -988,11 +984,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1003,11 +997,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1020,10 +1012,8 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1034,11 +1024,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1046,11 +1034,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1061,11 +1047,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 
s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1078,10 +1062,8 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1092,11 +1074,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1104,11 +1084,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1119,11 +1097,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1136,10 +1112,8 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1150,11 +1124,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1162,11 +1134,9 @@ define amdgpu_kernel void 
@v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1177,11 +1147,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1194,10 +1162,8 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1208,11 +1174,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; 
%bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1220,11 +1184,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1235,11 +1197,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1252,10 +1212,8 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: 
s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1266,11 +1224,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1278,11 +1234,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1293,11 +1247,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1310,10 +1262,8 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1324,11 +1274,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1336,11 +1284,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1351,11 +1297,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1368,10 +1312,8 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1382,11 +1324,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1394,11 +1334,9 @@ 
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1409,11 +1347,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1426,10 +1362,8 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1440,11 +1374,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: 
; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1452,11 +1384,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1467,11 +1397,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1484,10 +1412,8 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: 
s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1498,11 +1424,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1510,11 +1434,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1525,11 +1447,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1542,10 +1462,8 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1556,11 +1474,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1568,11 +1484,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1583,11 +1497,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1600,10 +1512,8 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1614,11 +1524,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ 
-1626,11 +1534,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1641,11 +1547,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1658,10 +1562,8 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1672,11 +1574,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; 
SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1684,11 +1584,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1699,11 +1597,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1716,10 +1612,8 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
-; SDAG-GFX11-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1730,11 +1624,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b32 s4, 0 -; SDAG-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1742,11 +1634,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1757,11 +1647,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b32 s4, 0 -; GISEL-GFX10-NEXT: s_mov_b32 s5, 0x40590000 ; GISEL-GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2754,7 +2642,3 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { } attributes #0 = { nounwind readnone convergent } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} -; GFX10: {{.*}} -; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index e2bdcfa6bbddc..8c76df3e041fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -1046,11 +1046,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1059,32 +1057,18 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_oeq: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_oeq: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_oeq: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: @@ -1103,10 +1087,8 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1124,11 +1106,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,32 +1117,18 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_one: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_one: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_one: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 
s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: @@ -1181,10 +1147,8 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1202,11 +1166,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1215,32 +1177,18 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ogt: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; 
GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ogt: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ogt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: @@ -1259,10 +1207,8 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1280,11 +1226,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1293,32 +1237,18 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_oge: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_oge: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_oge: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; 
GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: @@ -1337,10 +1267,8 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1358,11 +1286,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1371,32 +1297,18 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_olt: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_olt: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_olt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: @@ -1415,10 +1327,8 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1436,11 +1346,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], s[4:5] +; 
GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1449,32 +1357,18 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ole: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ole: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ole: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: @@ -1493,10 +1387,8 @@ define amdgpu_kernel 
void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1514,11 +1406,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1527,32 +1417,18 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ueq: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ueq: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ueq: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: @@ -1571,10 +1447,8 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1592,11 +1466,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1605,32 +1477,18 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_o: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_o: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_o: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: @@ -1649,10 +1507,8 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1670,11 +1526,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1683,32 +1537,18 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_uo: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_uo: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: 
v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_uo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: @@ -1727,10 +1567,8 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1748,11 +1586,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1761,32 +1597,18 @@ define amdgpu_kernel void 
@v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_une: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_une: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_une: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: @@ -1805,10 +1627,8 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1826,11 +1646,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1839,32 +1657,18 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ugt: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ugt: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ugt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: @@ -1883,10 +1687,8 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1904,11 +1706,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1917,32 +1717,18 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_uge: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_uge: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_uge: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: @@ -1961,10 +1747,8 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1982,11 +1766,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1995,32 +1777,18 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ult: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ult: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ult: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: @@ -2039,10 +1807,8 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -2060,11 +1826,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 0x40590000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2073,32 +1837,18 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_fcmp_f64_ule: 
-; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-SDAG-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_fcmp_f64_ule: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_fcmp_f64_ule: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: @@ -2117,10 +1867,8 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40590000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 44d1cfb96146e..7a492f51cce49 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -633,9 +633,8 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -646,10 +645,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -657,10 +655,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -671,10 +668,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -687,9 +683,8 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -700,10 +695,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -711,10 +705,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -725,10 +718,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -741,9 +733,8 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -754,10 +745,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, 
s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -765,10 +755,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -779,10 +768,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -795,9 +783,8 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -808,10 +795,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_u64_uge: 
; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -819,10 +805,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -833,10 +818,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -849,9 +833,8 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, 
s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -862,10 +845,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -873,10 +855,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,10 +868,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -903,9 +883,8 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 
%src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -916,10 +895,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -927,10 +905,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -941,10 +918,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: 
v_cmp_le_u64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -957,9 +933,8 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -970,10 +945,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -981,10 +955,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -995,10 +968,9 @@ define amdgpu_kernel void 
@v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1011,9 +983,8 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1024,10 +995,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1035,10 +1005,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1049,10 +1018,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1065,9 +1033,8 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1078,10 +1045,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1089,10 
+1055,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1103,10 +1068,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1119,9 +1083,8 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1132,10 +1095,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; 
SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, s[2:3], s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1143,10 +1105,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1157,10 +1118,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX10-NEXT: s_mov_b64 s[4:5], 0x64 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, s[2:3], s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 5d1aa7cbb9992..80e0202962462 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -689,10 +689,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -715,24 +714,24 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_eq: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_eq: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -741,20 +740,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_eq: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i64 %result, ptr addrspace(1) %out ret void @@ -764,10 +749,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -790,24 +774,24 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_ne: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_ne: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -816,20 +800,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_ne: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i64 %result, ptr addrspace(1) %out ret void @@ -839,10 +809,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -865,24 +834,24 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_u64_ugt: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_u64_ugt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -891,20 +860,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: 
s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_u64_ugt: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i64 %result, ptr addrspace(1) %out ret void @@ -914,10 +869,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -940,24 +894,24 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_u64_uge: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_u64_uge: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -966,20 +920,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_u64_uge: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i64 %result, ptr addrspace(1) %out ret void @@ -989,10 +929,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 
s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1015,24 +954,24 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_u64_ult: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_u64_ult: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1041,20 +980,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_u64_ult: -; 
GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i64 %result, ptr addrspace(1) %out ret void @@ -1064,10 +989,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1090,24 +1014,24 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_u64_ule: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_u64_ule: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 
v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1116,20 +1040,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_u64_ule: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i64 %result, ptr addrspace(1) %out ret void @@ -1139,10 +1049,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_lt_i64_e64 
s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1165,24 +1074,24 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_sgt: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_sgt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1191,20 +1100,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_sgt: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i64 %result, ptr addrspace(1) %out ret void @@ -1214,10 +1109,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1240,24 +1134,24 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_sge: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_sge: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1266,20 +1160,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_sge: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i64 %result, ptr addrspace(1) %out ret void @@ -1289,10 +1169,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1315,24 +1194,24 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_slt: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_slt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1341,20 +1220,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_slt: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; 
GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i64 %result, ptr addrspace(1) %out ret void @@ -1364,10 +1229,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[4:5], 0x64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1390,24 +1254,24 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm ; -; SDAG-GFX9-LABEL: v_icmp_i64_sle: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0x64 -; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v3, s3 -; SDAG-GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; SDAG-GFX9-NEXT: s_endpgm +; GFX9-LABEL: v_icmp_i64_sle: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1416,20 +1280,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm -; -; GISEL-GFX9-LABEL: v_icmp_i64_sle: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_mov_b64 s[4:5], 0x64 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GISEL-GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i64 %result, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll index 2be94f4dbc650..6e6a3cbdd4887 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -47,9 +47,7 @@ define amdgpu_kernel void @rsq_f64_constant_4.0(ptr addrspace(1) %out) #1 { } ; FUNC-LABEL: {{^}}rsq_f64_constant_100.0 -; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000 -; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}} 
-; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 0x40590000 define amdgpu_kernel void @rsq_f64_constant_100.0(ptr addrspace(1) %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0 store double %rsq, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 684edd27536b5..16b1ccf58cf6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -898,12 +898,13 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 ; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] ; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1] ; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32: @@ -936,11 +937,11 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { ; GFX6-GISEL-LABEL: test_frexp_f64_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff00000 ; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1] ; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6] ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; 
GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -950,16 +951,16 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { } define double @test_frexp_f64_i32_only_use_fract(double %a) { -; GFX6-LABEL: test_frexp_f64_i32_only_use_fract: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_mant_f64_e32 v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_fract: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[2:3], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract: ; GFX8: ; %bb.0: @@ -978,21 +979,32 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { double, i32 } @llvm.frexp.f64.i32(double %a) 
%result.0 = extractvalue { double, i32 } %result, 0 ret double %result.0 } define i32 @test_frexp_f64_i32_only_use_exp(double %a) { -; GFX6-LABEL: test_frexp_f64_i32_only_use_exp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_exp: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_exp: ; GFX8: ; %bb.0: @@ -1011,6 +1023,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { double, i32 } @llvm.frexp.f64.i32(double %a) %result.0 = extractvalue { double, i32 } %result, 1 ret i32 %result.0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 438b1bfe319a0..2a1488652d887 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -4558,10 +4558,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v17 -; GFX8-NEXT: v_mov_b32_e32 v13, v17 +; GFX8-NEXT: v_mov_b32_e32 v20, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, v20 +; GFX8-NEXT: v_mov_b32_e32 v22, v20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4571,61 +4571,62 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s4, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v24, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v23, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, v17 -; GFX8-NEXT: v_mov_b32_e32 v5, v17 -; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v23, v20 +; GFX8-NEXT: v_mov_b32_e32 v13, v20 +; GFX8-NEXT: v_mov_b32_e32 v9, v20 +; GFX8-NEXT: v_mov_b32_e32 v5, v20 +; GFX8-NEXT: v_mov_b32_e32 v25, 0 ; GFX8-NEXT: v_mov_b32_e32 v15, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 10, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 11, v2 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 14, v2 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 +; 
GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, 1 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v18, 15, v2 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v24, s3 -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v19, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v21, 15, v2 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_and_b32_sdwa v19, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v2 -; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v19, 0 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v24, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GFX8-NEXT: v_mov_b32_e32 v1, v20 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v26, s3 +; GFX8-NEXT: v_mov_b32_e32 v21, s3 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 12, v2 -; GFX8-NEXT: v_mov_b32_e32 v25, s2 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 13, v2 -; GFX8-NEXT: v_mov_b32_e32 v20, v17 -; GFX8-NEXT: v_mov_b32_e32 v1, v17 ; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v0 +; GFX8-NEXT: 
v_and_b32_e32 v24, 0xffff, v0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 6, v2 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[19:22] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 4, v2 @@ -4634,23 +4635,23 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v6 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v21, s1 +; GFX8-NEXT: v_mov_b32_e32 v23, s1 ; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v14 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v22 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v24 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: v_mov_b32_e32 v20, s0 +; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index e89c44d5b94a8..a078d89acf382 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3583,190 +3583,191 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[26:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GCN-HSA-NEXT: s_add_u32 s10, s2, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] +; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], 
v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v36, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[34:37] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 
0xc0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x80 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v26 +; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[32:35] +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; 
GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 
0xffff, v29 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v25 -; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v24 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[27:30] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v26 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v26 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v9, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: @@ -4383,189 +4384,191 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s2, 64 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[24:25] -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v34, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v32, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v28 +; GCN-HSA-NEXT: v_bfe_i32 v34, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v32, v28, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v33, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v31, v30, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v34, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v32, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[31:34] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 +; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v16, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[36:37], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; 
GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 -; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v28 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v19, v28, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v27 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v26 -; GCN-HSA-NEXT: v_bfe_i32 v2, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v27, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v25, v24, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[25:28] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v11 +; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v29 -; GCN-HSA-NEXT: v_bfe_i32 v21, v29, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v31 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v17, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v30, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 +; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27 +; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26 +; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: @@ -6526,16 +6529,16 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -6543,65 +6546,65 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, 0 ; GCN-HSA-NEXT: s_waitcnt 
vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v5 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v16, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[2:5] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[6:9] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ 
-7358,26 +7361,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[5:8], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[9:12], v[9:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[13:16], v[13:14] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 @@ -7392,124 +7396,123 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v4 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s10 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; 
GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, 
s9 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v14 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; 
GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[26:29] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[19:22] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[22:25] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[18:21] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 
0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[14:17] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index ea1e784fe58e2..dcaf7664d5b5a 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -45,14 +45,14 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_11bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_inst_valu_offset_11bit_max: +; 
GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_inst_valu_offset_11bit_max: ; GFX11: ; %bb.0: @@ -60,18 +60,6 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) { ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_11bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x7ff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 2047 %load = load i8, ptr %gep, align 4 ret i8 %load @@ -85,14 +73,14 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_12bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_inst_valu_offset_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: 
flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_inst_valu_offset_12bit_max: ; GFX11: ; %bb.0: @@ -100,18 +88,6 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) { ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0xfff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 4095 %load = load i8, ptr %gep, align 4 ret i8 %load @@ -127,14 +103,14 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_inst_valu_offset_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max: ; GFX11-SDAG: ; 
%bb.0: @@ -148,36 +124,17 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) { ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-GISEL-LABEL: flat_inst_valu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x1fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -187,610 +144,320 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) { } define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr %p, i64 -2048 + %load = load i8, ptr %gep, align 4 + ret i8 %load +} + +define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) { +; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, 
vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr %p, i64 -4096 + %load = load i8, ptr %gep, align 4 + ret i8 %load +} + +define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) { +; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr %p, i64 -8192 + %load = load i8, ptr %gep, align 4 + ret i8 %load +} + +define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) { +; GFX9-LABEL: 
flat_inst_valu_offset_2x_11bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr %p, i64 4095 + %load = load i8, ptr %gep, align 4 + ret i8 %load +} + +define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) { +; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; 
GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xf800 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xf800 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xf800 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -2048 + %gep = getelementptr i8, ptr %p, i64 8191 %load = load i8, ptr %gep, align 4 ret i8 %load } -define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max: +define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) { +; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x3fff, v0 +; 
GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xf000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -4096 - %load = load i8, ptr %gep, align 4 - ret i8 %load -} - -define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, 
v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, 
vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xe000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -8192 + %gep = getelementptr i8, ptr %p, i64 16383 %load = load i8, ptr %gep, align 4 ret i8 %load } -define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) { -; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max: +define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) { +; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_11bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: 
flat_inst_valu_offset_2x_neg_11bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max: +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0xfff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 4095 - %load = load i8, ptr %gep, align 4 - ret i8 %load -} - -define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX10-SDAG: ; %bb.0: 
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x1fff -; 
GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8191 - %load = load i8, ptr %gep, align 4 - ret i8 %load -} - -define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x3fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x3fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x3fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 16383 - %load = load i8, ptr %gep, align 4 - ret i8 %load -} - -define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, 
vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, 
v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xf000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -4096 %load = load i8, ptr %gep, align 4 ret i8 %load } define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX11-SDAG-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xe000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -8192 %load = load i8, ptr %gep, align 4 ret i8 %load } define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) { -; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1] -; 
GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xc000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xc000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; 
GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xc000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -16384 %load = load i8, ptr %gep, align 4 ret i8 %load diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index c9c6c0912bda9..137c83a0fd80c 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -72,11 +72,8 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX10-GISEL-LABEL: global_inst_valu_offset_12bit_max: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0xfff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -105,11 +102,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX9-GISEL-LABEL: global_inst_valu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -117,11 +111,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX10-GISEL-LABEL: global_inst_valu_offset_13bit_max: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -129,12 +120,8 @@ 
define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-LABEL: global_inst_valu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x1fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -204,18 +191,14 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max: ; 
GFX11: ; %bb.0: @@ -223,87 +206,38 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: global_inst_valu_offset_neg_12bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load } define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) { -; GFX9-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: 
global_inst_valu_offset_neg_13bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xe000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 
0xffffe000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -320,11 +254,8 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0xfff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -353,11 +284,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: ; 
GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -365,11 +293,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x1fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -377,12 +302,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x1fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX11-GISEL-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -422,11 +343,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], 0x3fff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x3fff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -434,11 +352,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b64 s[4:5], 0x3fff -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -446,12 +361,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], 0x3fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -495,18 +406,14 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xf000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: @@ -514,159 +421,70 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_11bit_max: -; 
GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load } define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { -; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xe000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xe000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max: -; GFX11-SDAG: ; %bb.0: -; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load } define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { -; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xc000 -; GFX9-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xc000 -; GFX10-GISEL-NEXT: s_mov_b32 s5, -1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 
s[30:31] -; -; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xc000 -; GFX11-GISEL-NEXT: s_mov_b32 s1, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 -; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index ca79772dbed74..536b2d054272e 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3065,33 +3065,31 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX7-LABEL: v_mul_284_add_82_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x52 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_movk_i32 s6, 0x11c -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX7-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX7-NEXT: s_movk_i32 s4, 0x11c +; GFX7-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x52 +; GFX7-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2] +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_284_add_82_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x52 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_movk_i32 s6, 0x11c -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX8-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: s_movk_i32 s4, 0x11c +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x52 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mul_284_add_82_i64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0x52 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: s_movk_i32 s6, 0x11c +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] ; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2] @@ -3101,8 +3099,8 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, 0x52 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x11c +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 @@ -3113,9 +3111,9 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX10-LABEL: v_mul_284_add_82_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b64 s[4:5], 0x52 +; GFX10-NEXT: s_movk_i32 s4, 0x11c ; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, 0x11c, 
v0, s[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, s4, 0x52 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x11c, v2, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = mul i64 %arg, 284 @@ -3139,33 +3137,31 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX7-LABEL: v_mul_934584645_add_8234599_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x7da667 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_mov_b32 s6, 0x37b4a145 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX7-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX7-NEXT: s_mov_b32 s4, 0x37b4a145 +; GFX7-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x7da667 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2] +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_934584645_add_8234599_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x7da667 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_mov_b32 s6, 0x37b4a145 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX8-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: s_mov_b32 s4, 0x37b4a145 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7da667 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mul_934584645_add_8234599_i64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7da667 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: s_mov_b32 s6, 0x37b4a145 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] ; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2] @@ -3175,8 +3171,8 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7da667 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: s_mov_b32 s6, 0x37b4a145 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 @@ -3187,9 +3183,9 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX10-LABEL: v_mul_934584645_add_8234599_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b64 s[4:5], 0x7da667 +; GFX10-NEXT: s_mov_b32 s4, 0x37b4a145 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, 0x37b4a145, v0, s[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, s4, 0x7da667 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x37b4a145, v2, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = mul i64 %arg, 934584645 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 0654d55576645..3dc565ceed0d0 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -59,10 +59,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; ; SI-GISEL-LABEL: s_rsq_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: s_brev_b32 s3, 8 -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -145,10 +143,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; ; VI-GISEL-LABEL: s_rsq_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_mov_b32 s2, 0 -; 
VI-GISEL-NEXT: s_brev_b32 s3, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -243,11 +239,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; ; SI-GISEL-LABEL: s_rsq_f64_fabs: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; SI-GISEL-NEXT: s_brev_b32 s3, 8 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 @@ -329,11 +323,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; ; VI-GISEL-LABEL: s_rsq_f64_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_mov_b32 s2, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: s_brev_b32 s3, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 @@ -428,10 +420,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; ; SI-GISEL-LABEL: s_neg_rsq_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: s_brev_b32 s3, 8 -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; 
SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -514,10 +504,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; ; VI-GISEL-LABEL: s_neg_rsq_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_mov_b32 s2, 0 -; VI-GISEL-NEXT: s_brev_b32 s3, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -612,11 +600,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; ; SI-GISEL-LABEL: s_neg_rsq_neg_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; SI-GISEL-NEXT: s_brev_b32 s3, 8 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 @@ -698,11 +684,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; ; VI-GISEL-LABEL: s_neg_rsq_neg_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_mov_b32 s2, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: s_brev_b32 s3, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 @@ -797,11 +781,11 @@ define double @v_rsq_f64(double %x) { ; SI-GISEL-LABEL: v_rsq_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -879,11 +863,11 @@ define double @v_rsq_f64(double %x) { ; VI-GISEL-LABEL: v_rsq_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -966,11 +950,11 @@ define double @v_rsq_f64_fabs(double %x) { ; SI-GISEL-LABEL: v_rsq_f64_fabs: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 
v[0:1], |v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -1048,11 +1032,11 @@ define double @v_rsq_f64_fabs(double %x) { ; VI-GISEL-LABEL: v_rsq_f64_fabs: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1136,11 +1120,11 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -1218,11 +1202,11 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] 
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1305,11 +1289,11 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -1387,11 +1371,11 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; 
VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1474,11 +1458,11 @@ define double @v_neg_rsq_f64(double %x) { ; SI-GISEL-LABEL: v_neg_rsq_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -1556,11 +1540,11 @@ define double @v_neg_rsq_f64(double %x) { ; VI-GISEL-LABEL: v_neg_rsq_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -1680,24 +1664,26 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], 
v4 -; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1825,12 +1811,14 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; VI-GISEL-NEXT: 
v_ldexp_f64 v[0:1], v[0:1], v7 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1855,15 +1843,15 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 @@ -1977,24 +1965,26 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: 
v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2122,12 +2112,14 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2152,15 +2144,15 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; 
VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 @@ -2242,15 +2234,17 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: 
v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2258,7 +2252,7 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2358,12 +2352,14 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2388,15 +2384,15 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e64 
s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] @@ -2511,15 +2507,17 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2527,7 +2525,7 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] 
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2657,12 +2655,14 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -2687,15 +2687,15 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: 
v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 @@ -2772,11 +2772,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -2854,11 +2854,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -2943,11 +2943,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: 
s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3025,11 +3025,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3104,11 +3104,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 
v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3174,11 +3174,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3249,11 +3249,11 @@ define double @v_rsq_f64__afn(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3319,11 +3319,11 @@ define double @v_rsq_f64__afn(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; 
VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3395,11 +3395,11 @@ define double @v_neg_rsq_f64__afn(double %x) { ; SI-GISEL-LABEL: v_neg_rsq_f64__afn: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3467,11 +3467,11 @@ define double @v_neg_rsq_f64__afn(double %x) { ; VI-GISEL-LABEL: v_neg_rsq_f64__afn: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3543,11 +3543,11 @@ define double 
@v_rsq_f64__afn_ninf(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3613,11 +3613,11 @@ define double @v_rsq_f64__afn_ninf(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3688,11 +3688,11 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; 
SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3758,11 +3758,11 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3833,11 +3833,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -3903,11 +3903,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -3979,11 +3979,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -4051,11 +4051,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: 
v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -4135,11 +4135,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -4217,11 +4217,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -4325,13 +4325,13 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-GISEL-NEXT: s_mov_b32 s4, 0 ; SI-GISEL-NEXT: s_brev_b32 s5, 8 ; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: 
v_mov_b32_e32 v10, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -4339,28 +4339,30 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] @@ -4445,12 +4447,14 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: s_mov_b32 s4, 0 ; VI-GISEL-NEXT: s_brev_b32 s5, 8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 +; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5] +; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -4543,10 +4547,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; ; SI-GISEL-LABEL: s_rsq_f64_unsafe: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: 
s_brev_b32 s3, 8 -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -4617,10 +4619,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; ; VI-GISEL-LABEL: s_rsq_f64_unsafe: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_mov_b32 s2, 0 -; VI-GISEL-NEXT: s_brev_b32 s3, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -4703,11 +4703,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; SI-GISEL-LABEL: v_rsq_f64_unsafe: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] @@ -4773,11 +4773,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; VI-GISEL-LABEL: v_rsq_f64_unsafe: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 
0, v2, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 @@ -5109,14 +5109,15 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; SI-GISEL-LABEL: v_div_contract_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5128,8 +5129,7 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5190,11 +5190,11 @@ define double 
@v_div_contract_sqrt_f64(double %x, double %y) { ; VI-GISEL-LABEL: v_div_contract_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5276,14 +5276,15 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5295,8 +5296,7 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: 
v_mov_b32_e32 v6, 0x260 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5357,11 +5357,11 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5443,14 +5443,15 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: 
v_mul_f64 v[4:5], v[2:3], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -5462,8 +5463,7 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5524,11 +5524,11 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5] +; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 @@ -5568,16 +5568,18 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: s_brev_b32 s7, 8 -; SI-SDAG-NEXT: s_mov_b32 s6, 0 -; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_brev_b32 s5, 8 +; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, 
vcc ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000 +; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -5600,7 +5602,7 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], s[6:7], v[0:1], s[6:7] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v7 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] ; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] @@ -5611,19 +5613,20 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_brev_b32 s7, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 ; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000 -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], 
v[0:1], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -5655,9 +5658,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 @@ -5695,12 +5699,13 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100 ; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index 7027521d7e2dc..714b2af1698fe 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -171,10 +171,10 @@ entry: ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: ; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} ; CI-NOHSA-NOT: v_add -; SI: 
buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} -; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -201,11 +201,11 @@ entry: ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: -; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 +; SI-DAG: s_mov_b64 s[{{[0-9:]+}}], 0x13480 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4f2fd3f50494c..9cb6842ae0a18 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1592,31 +1592,30 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v7, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index be0aa394dd99d..9cd13b3d45150 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -1284,18 +1284,15 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; VI-LABEL: v_shl_i64_32_bit_constant: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_mov_b64 s[0:1], 0x12d687 +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 0x12d687, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_shl_i64_32_bit_constant: @@ -1875,30 +1872,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b64 s[0:1], 0x40800000 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; 
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b64 s[0:1], 0x40800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64: @@ -1931,10 +1926,10 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s0, -4.0 -; SI-NEXT: s_mov_b32 s1, s6 +; SI-NEXT: s_mov_b32 s1, -1 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -1946,10 +1941,10 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s0, -4.0 -; VI-NEXT: s_mov_b32 s1, s6 +; VI-NEXT: s_mov_b32 s1, -1 ; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 033fd8ef89cfe..536f3e77e67d9 100644 --- 
a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -16,2668 +16,2662 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b32 s44, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s45, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_mov_b32 s46, -1 -; GFX6-NEXT: s_mov_b32 s47, 0xe8f000 -; GFX6-NEXT: s_add_u32 s44, s44, s3 +; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX6-NEXT: s_add_u32 s40, s40, s3 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 13, v0 -; GFX6-NEXT: s_mov_b32 s18, 0 -; GFX6-NEXT: s_mov_b32 s19, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_movk_i32 s4, 0x80 -; GFX6-NEXT: s_mov_b32 s5, s18 -; GFX6-NEXT: s_mov_b64 s[6:7], s[18:19] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:3968 -; GFX6-NEXT: s_addc_u32 s45, s45, 0 -; GFX6-NEXT: s_movk_i32 s8, 0x100 -; GFX6-NEXT: s_mov_b32 s9, s18 -; GFX6-NEXT: s_mov_b64 s[10:11], s[18:19] -; GFX6-NEXT: s_movk_i32 s12, 0x180 -; GFX6-NEXT: s_mov_b32 s13, s18 -; GFX6-NEXT: s_mov_b64 s[14:15], s[18:19] -; GFX6-NEXT: s_movk_i32 s20, 0x200 -; GFX6-NEXT: s_mov_b32 s21, s18 -; GFX6-NEXT: s_mov_b64 s[22:23], s[18:19] -; GFX6-NEXT: s_movk_i32 s24, 0x280 -; GFX6-NEXT: s_mov_b32 s25, s18 -; GFX6-NEXT: s_mov_b64 s[26:27], s[18:19] -; GFX6-NEXT: s_movk_i32 s28, 0x300 -; GFX6-NEXT: s_mov_b32 s29, s18 -; GFX6-NEXT: s_mov_b64 s[30:31], s[18:19] -; GFX6-NEXT: s_movk_i32 s36, 0x380 -; GFX6-NEXT: s_mov_b32 s37, s18 -; GFX6-NEXT: s_mov_b64 s[38:39], s[18:19] -; GFX6-NEXT: s_movk_i32 
s40, 0x400 -; GFX6-NEXT: s_mov_b32 s41, s18 -; GFX6-NEXT: s_mov_b64 s[42:43], s[18:19] -; GFX6-NEXT: s_mov_b64 s[16:17], s[2:3] +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v7, vcc, s2, v5 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v0, vcc +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 +; GFX6-NEXT: s_mov_b64 s[8:9], 0x100 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_mov_b64 s[12:13], 0x180 +; GFX6-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX6-NEXT: s_mov_b64 s[16:17], 0x200 +; GFX6-NEXT: s_mov_b64 s[18:19], s[6:7] +; GFX6-NEXT: s_mov_b64 s[20:21], 0x280 +; GFX6-NEXT: s_mov_b64 s[22:23], s[6:7] +; GFX6-NEXT: s_mov_b64 s[24:25], 0x300 +; GFX6-NEXT: s_mov_b64 s[26:27], s[6:7] +; GFX6-NEXT: s_mov_b64 s[28:29], 0x380 +; GFX6-NEXT: s_mov_b64 s[30:31], s[6:7] +; GFX6-NEXT: s_mov_b64 s[36:37], 0x400 +; GFX6-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX6-NEXT: s_mov_b32 s33, 0x4f900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1272 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1276 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1280 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:16 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 
v[7:10], v[0:1], s[4:7], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1300 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:20 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1304 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1308 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1312 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:24 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:28 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:32 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:32 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1332 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1336 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1340 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1344 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:40 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:44 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:48 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], 
s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1364 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:52 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1368 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1372 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1376 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:56 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:60 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:64 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1396 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:68 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1400 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1404 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:72 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:76 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:80 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, 
off, s[44:47], 0 offset:1428 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:84 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1432 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1436 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:88 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:92 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:96 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:100 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1464 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1468 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1472 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:112 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], 0 offset:116 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1496 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1500 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:120 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:124 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:128 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1556 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:132 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1560 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1564 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1568 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:144 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1588 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:148 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1592 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1596 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1600 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:160 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1620 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:164 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1624 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1628 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1632 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:176 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1652 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:180 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1656 ; 4-byte Folded Spill 
-; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1660 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1664 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:192 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1684 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:196 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1688 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1692 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1696 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:200 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:204 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:208 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1716 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:212 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1720 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1724 ; 4-byte Folded 
Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1728 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:216 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:220 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:224 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1748 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:228 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1752 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1756 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1760 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:232 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:236 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:240 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1780 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:244 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1784 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1788 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1792 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:248 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:252 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:256 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1860 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:260 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1864 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1868 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1872 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:264 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:272 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:272 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1892 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:276 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1896 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1900 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1904 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:280 ; 
4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:284 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:288 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:288 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1924 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:292 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1928 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1932 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1936 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:296 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:300 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:304 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:304 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1956 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:308 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1960 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1964 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:1968 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:312 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 
offset:316 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:320 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:320 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:1988 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:324 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:1992 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:1996 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:328 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:332 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:336 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:336 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2020 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:340 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2024 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2028 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2032 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:344 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:348 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, 
s[40:43], 0 offset:352 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:352 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:356 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2056 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2060 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:360 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:364 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:368 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:368 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2084 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:372 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2088 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2092 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2096 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:376 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:380 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:384 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:384 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2148 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:388 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2152 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2156 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2160 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:392 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:396 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:400 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2180 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:404 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2184 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2188 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2192 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:416 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4000 +; 
GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:416 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2212 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:420 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2216 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2220 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2224 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:424 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:428 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:432 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:432 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2244 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:436 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2248 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2252 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2256 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:448 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:448 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:452 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2280 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2284 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2288 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:464 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:464 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2308 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:468 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2312 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2316 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2320 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:472 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:476 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:480 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:480 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 
offset:2340 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:484 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2344 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2348 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2352 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:496 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:496 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2372 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:500 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2376 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2380 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2384 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:512 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:512 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], 0 offset:516 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2456 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2460 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:524 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:528 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:528 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:532 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2488 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2492 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:536 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:544 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:544 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2516 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:548 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2520 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2524 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2528 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:556 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:560 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:560 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:564 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2552 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2556 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:568 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:576 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:576 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:580 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2584 ; 4-byte Folded 
Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2588 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:584 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:588 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:592 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:592 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:596 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2616 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2620 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:600 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:608 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:608 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2644 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:612 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2648 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2652 ; 4-byte 
Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2656 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:620 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:624 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:624 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2676 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:628 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2680 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2684 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2688 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:632 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:636 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:640 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:640 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2740 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:644 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2744 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2748 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2752 ; 
4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:648 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:652 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:656 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:656 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2772 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:660 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2776 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2780 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2784 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:664 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:668 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:672 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:672 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2804 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:676 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2808 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2812 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2816 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 
offset:680 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:684 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:688 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:688 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2836 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:692 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2840 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2844 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2848 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:696 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:700 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:704 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:704 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2868 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:708 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2872 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2876 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2880 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:712 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, 
s[40:43], 0 offset:716 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:720 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:720 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2900 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:724 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2904 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2908 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2912 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:728 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:732 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:736 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:736 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2932 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:740 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2936 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2940 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2944 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:744 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:748 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, 
off, s[40:43], 0 offset:752 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:752 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:2964 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:756 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:2968 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:2972 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:2976 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:760 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:764 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:768 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:768 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:772 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3048 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3052 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:776 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:780 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:784 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) 
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:784 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:788 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3080 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3084 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3088 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:792 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:796 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:800 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:804 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3112 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3116 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3120 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:808 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:812 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:816 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4016 
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:816 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:820 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3144 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3148 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:824 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:828 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:832 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:832 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:836 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3176 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3180 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:840 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:844 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:848 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:848 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3204 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:852 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3208 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3212 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3216 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:856 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:860 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:864 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:864 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3236 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:868 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3240 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3244 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3248 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:872 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:876 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:880 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:880 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 
offset:3268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:884 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3272 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3276 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3280 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:888 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:892 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:896 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:896 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3332 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:900 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3336 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3340 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3344 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:904 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:908 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:912 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:912 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3364 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], 0 offset:916 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3368 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3372 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3376 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:920 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:924 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:928 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:928 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3396 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:932 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3400 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3404 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:936 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:940 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:944 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:944 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3428 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:948 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3432 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3436 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:952 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:956 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:960 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:960 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:964 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3464 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3468 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3472 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:968 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:972 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:976 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:976 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:980 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3496 ; 4-byte Folded 
Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3500 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:984 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:988 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:992 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:992 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[44:47], 0 offset:3524 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:996 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[44:47], 0 offset:3528 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[44:47], 0 offset:3532 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[44:47], 0 offset:3536 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[40:43], 0 addr64 offset:4080 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_add_i32_e32 v7, vcc, s0, v5 +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1008 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1008 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3556 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1012 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3560 ; 4-byte Folded 
Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3564 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3568 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1016 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1020 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1024 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1028 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:16 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1032 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1036 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1040 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1040 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:20 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1044 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:24 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:28 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v3, off, s[44:47], 0 offset:32 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1056 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:32 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1056 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:36 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1060 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:40 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:44 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:48 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1072 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1072 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:52 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1076 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:56 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:60 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:64 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v1, off, s[40:43], 0 offset:1080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1084 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1088 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:64 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1088 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:68 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1092 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:72 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:76 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:80 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1096 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1100 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1104 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1104 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:84 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1108 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:88 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:92 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:96 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1112 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v2, off, s[40:43], 0 offset:1116 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1120 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1120 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:100 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1124 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:104 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:108 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:112 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1128 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1136 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1136 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:116 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1140 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:120 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:124 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:128 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1148 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], 0 offset:1152 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1152 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1156 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:136 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:140 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1160 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1168 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1168 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:148 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1172 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:152 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:156 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:160 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1180 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1184 ; 4-byte Folded Spill ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1184 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1188 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:168 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:172 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1192 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1196 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1200 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:180 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1204 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:184 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:188 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:192 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1208 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1212 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1216 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 
addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1216 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:196 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1220 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:200 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:204 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:208 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1224 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1228 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1232 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1232 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:212 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1236 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:216 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:220 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:224 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1240 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1244 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1248 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 
offset:1248 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:228 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1252 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:232 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:236 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:240 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1256 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1260 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1264 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1264 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:244 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1268 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:248 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:252 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:256 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1280 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:256 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1280 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, 
s[44:47], 0 offset:260 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1284 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:264 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:268 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1288 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1292 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1296 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:272 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1296 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1300 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:280 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:284 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:288 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1304 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1308 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1312 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:288 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1312 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:292 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], 0 offset:1316 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:296 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:300 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:304 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1320 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1324 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1328 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:304 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1328 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:308 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1332 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:312 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:316 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:320 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1336 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1340 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1344 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:320 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1344 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:324 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1348 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:328 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:332 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:336 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1352 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1356 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1360 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:336 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1360 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:340 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1364 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:344 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:348 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:352 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1368 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1372 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1376 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:352 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1376 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:356 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1380 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:360 ; 4-byte Folded Spill -; 
GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:364 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:368 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1384 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1388 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1392 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:368 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1392 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:372 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1396 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:376 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:380 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:384 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1400 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1404 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1408 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:384 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1408 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:388 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1412 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:392 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:396 ; 4-byte Folded Spill -; 
GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:400 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1416 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1420 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1424 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:400 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1424 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:404 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1428 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:408 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:412 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:416 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1432 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1436 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1440 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:416 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1440 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:420 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1444 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:424 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:428 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:432 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1456 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:432 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1456 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:436 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1460 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:440 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:444 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1468 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1472 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:448 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1472 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1476 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:456 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:460 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1480 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1488 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:464 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1488 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:468 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1492 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:472 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:476 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:480 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1500 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1504 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:480 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1504 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1508 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:488 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:492 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1512 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1516 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1520 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:496 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1520 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:500 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1524 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:504 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:508 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:512 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1528 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1532 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1536 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:512 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1536 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:516 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1540 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:520 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:524 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:528 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1552 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:528 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1552 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:532 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1556 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:536 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:540 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1568 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:544 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1568 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1572 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:552 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:556 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1584 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], 
s[16:19], 0 addr64 offset:560 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1584 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1588 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:568 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:572 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1600 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:576 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1604 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:584 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:588 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1608 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1616 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:592 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 
0 addr64 offset:1616 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1620 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:600 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:604 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:608 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1632 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:608 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1632 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1636 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:616 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:620 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1640 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1644 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1648 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:624 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1648 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword 
v0, off, s[44:47], 0 offset:628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1652 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:632 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:636 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:640 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1656 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1660 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1664 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:640 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1664 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:644 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1668 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:648 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:652 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:656 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1672 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1676 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1680 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:656 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1680 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:660 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword 
v0, off, s[40:43], 0 offset:1684 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:664 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:668 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:672 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1688 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1692 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1696 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:672 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1696 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:676 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1700 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:680 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:684 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:688 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1704 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1708 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1712 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:688 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1712 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:692 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1716 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) 
-; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:696 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:700 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:704 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1720 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1724 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1728 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:704 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1728 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:708 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1732 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:712 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:716 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:720 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1736 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1740 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1744 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:720 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1744 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:724 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1748 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:728 ; 4-byte Folded Spill 
-; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:732 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:736 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1752 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1756 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1760 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:736 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1760 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:740 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1764 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:744 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:748 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:752 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1768 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1772 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1776 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:752 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1776 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:756 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1780 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:760 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:764 ; 4-byte Folded Spill 
-; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:768 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1784 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1788 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1792 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:768 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1792 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:772 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1796 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:776 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:780 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:784 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1800 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1804 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1808 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:784 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1808 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:788 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1812 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:792 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:796 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:800 ; 4-byte Folded Spill 
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1816 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1820 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1824 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1824 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:804 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1828 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:808 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:812 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:816 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1832 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1836 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1840 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:816 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1840 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:820 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1844 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:824 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:828 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:832 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1848 ; 4-byte Folded Spill 
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1852 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1856 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:832 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1856 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:836 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1860 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:840 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:844 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:848 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1864 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1868 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1872 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:848 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1872 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:852 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1876 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:856 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:860 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:864 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1880 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1884 ; 4-byte Folded Spill 
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1888 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:864 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1888 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:868 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1892 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:872 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:876 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:880 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1896 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1900 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1904 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:880 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1904 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:884 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1908 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:888 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:892 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:896 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1912 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1916 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1920 ; 4-byte Folded Spill 
; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:896 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1920 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:900 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1924 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:904 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:908 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:912 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1928 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1932 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1936 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:912 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1936 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:916 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1940 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:920 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:924 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:928 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1944 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1948 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1952 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], 
s[16:19], 0 addr64 offset:928 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1952 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:932 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1956 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:936 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:940 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:944 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1960 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1964 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1968 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:944 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:948 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1972 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:952 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:956 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:960 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1976 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1980 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:1984 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:960 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 
0 addr64 offset:1984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:964 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:1988 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:968 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:972 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:976 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:1992 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:1996 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2000 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:976 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:980 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2004 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:984 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:988 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:992 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2008 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2012 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2016 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:992 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword 
v0, off, s[44:47], 0 offset:996 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2020 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1000 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1004 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1008 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2024 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2028 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2032 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1008 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1012 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2036 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1016 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1020 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1024 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2040 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2048 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1024 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1028 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v0, off, s[40:43], 0 offset:2052 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1032 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1036 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1040 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2064 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1040 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2068 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1048 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1052 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2080 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1056 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2080 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2084 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1064 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1068 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2088 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2092 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2096 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1072 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2096 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2100 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1080 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1084 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1088 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2112 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1088 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2112 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1092 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2116 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, 
s[44:47], 0 offset:1096 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1100 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2120 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2124 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2128 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1104 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2128 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1108 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2132 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1112 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1116 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1120 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2144 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1120 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2144 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1124 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2148 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1128 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v2, off, s[44:47], 0 offset:1132 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2160 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1136 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2160 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1140 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2164 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1144 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1148 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1152 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2176 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1152 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2176 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2180 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1160 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1164 ; 4-byte Folded Spill -; 
GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2192 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1168 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2192 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1172 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2196 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1176 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1180 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1184 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2200 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2204 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2208 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1184 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2208 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2212 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1192 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1196 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1200 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2216 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2220 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2224 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2224 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1204 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2228 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1208 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1212 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1216 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2232 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2236 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2240 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1216 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2240 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1220 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2244 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1224 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1228 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1232 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 
offset:2248 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2252 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2256 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1232 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2256 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1236 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2260 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1240 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1244 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1248 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2264 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2268 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2272 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1248 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2272 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1252 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2276 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1256 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1260 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1264 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2280 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, 
s[40:43], 0 offset:2284 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2288 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1264 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2288 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2292 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2296 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2300 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2304 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1280 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2304 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2308 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1320 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2312 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2316 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], 0 offset:2320 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1296 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2320 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2324 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1352 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2328 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2332 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2336 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1312 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2336 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2340 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2344 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2348 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2352 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1328 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2352 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2356 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2360 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2364 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2368 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1344 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2368 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2372 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2376 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2380 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2384 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], 
v[5:6], s[16:19], 0 addr64 offset:1360 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2384 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2388 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1480 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2392 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2396 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2400 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1376 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2404 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2408 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2416 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1392 +; GFX6-NEXT: buffer_load_dwordx4 
v[0:3], v[5:6], s[4:7], 0 addr64 offset:2416 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2420 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1528 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2424 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2428 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2432 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1408 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2432 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2436 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2440 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2448 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1424 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2448 ; GFX6-NEXT: s_waitcnt vmcnt(0) 
-; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2452 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2460 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2464 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1440 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2464 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2468 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2472 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2476 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2480 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1456 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2480 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2484 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1640 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2492 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2496 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1472 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2496 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2500 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1672 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2504 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2512 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1488 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2512 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 
offset:2516 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1704 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2524 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2528 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1504 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2528 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2532 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1736 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2536 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2544 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1520 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2544 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2548 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1768 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2556 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2560 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1536 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2560 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2564 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1800 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2568 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2576 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1552 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2576 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2580 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte 
Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2584 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2588 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2592 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1568 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2592 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2596 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1832 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2600 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2608 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1584 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2608 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2612 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 
offset:1852 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2620 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2624 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2624 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2628 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1880 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2632 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2636 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2640 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1616 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2640 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2644 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1912 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, 
s[44:47], 0 offset:1920 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2648 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2652 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2656 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1632 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2656 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2660 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2664 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2668 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2672 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1648 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2672 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2676 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v1, off, s[40:43], 0 offset:2680 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2684 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2688 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1664 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2688 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2692 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2696 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2700 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2704 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1680 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2704 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2708 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2712 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2716 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2720 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1696 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2720 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2724 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2728 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2732 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2736 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1712 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2736 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2100 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2740 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2744 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2748 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2752 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1728 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2752 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2756 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2120 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2760 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2764 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2768 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1744 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2768 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2772 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2776 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2780 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 
offset:2784 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1760 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2784 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2788 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2792 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2796 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2800 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1776 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2804 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2200 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2808 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2812 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2816 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1792 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2816 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2820 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2232 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2824 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2828 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2832 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1808 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2832 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2836 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2264 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2840 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2844 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2848 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1824 
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2848 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2852 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2296 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2856 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2860 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2864 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1840 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2864 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2324 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2868 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2328 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2872 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2876 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2880 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1856 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 
offset:2880 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2884 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2360 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2888 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2892 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2896 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1872 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2896 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2900 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2392 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2904 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2908 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2912 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1888 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2912 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword 
v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2916 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2920 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2924 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2928 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1904 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2928 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2932 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2424 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2936 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2940 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2944 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1920 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2944 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v0, off, s[40:43], 0 offset:2948 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2952 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2956 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2960 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1936 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2960 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2964 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2472 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2968 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2972 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2976 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1952 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2976 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2980 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:2984 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:2988 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:2992 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2992 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:2996 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3008 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3008 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3012 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, 
s[44:47], 0 offset:2568 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3016 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3020 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3024 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3024 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3028 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2608 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3032 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3036 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3040 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3040 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3044 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3056 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3056 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3060 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3072 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3072 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3076 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Spill -; 
GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3084 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3088 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3088 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3092 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2720 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3096 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3100 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3104 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3104 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3108 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2732 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3112 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3116 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3120 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2096 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3120 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3124 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2760 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3128 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3132 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3136 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3136 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3140 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2792 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 
offset:3144 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3148 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3152 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3152 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3156 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2824 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3160 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3164 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3168 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3168 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3172 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2856 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3176 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, 
s[40:43], 0 offset:3180 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3184 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3184 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3188 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2888 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3192 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3196 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3200 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3204 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2920 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3208 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3212 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], 0 offset:3216 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3216 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3220 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2952 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3224 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3228 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3232 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3232 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3236 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:2984 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3240 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3244 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3248 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2224 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3248 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3252 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3256 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3260 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3264 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2240 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3264 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3012 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3268 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3016 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3020 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3024 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3272 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3276 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3280 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], 
v[5:6], s[16:19], 0 addr64 offset:2256 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3280 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3028 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3284 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3032 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3036 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3040 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3288 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3292 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3296 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2272 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3296 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3300 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3304 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3308 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3312 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2288 +; GFX6-NEXT: buffer_load_dwordx4 
v[0:3], v[5:6], s[4:7], 0 addr64 offset:3312 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3316 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3320 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3324 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3328 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2304 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3328 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3332 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3336 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3340 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3344 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2320 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3344 ; GFX6-NEXT: s_waitcnt vmcnt(0) 
-; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3348 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3352 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3356 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3360 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2336 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3360 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3364 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3368 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3372 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3376 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2352 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3376 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3380 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3384 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3388 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3392 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2368 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3392 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3396 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3400 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3404 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3408 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2384 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3408 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 
offset:3412 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3416 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3420 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3424 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2400 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3424 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3300 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3428 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3304 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3308 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3312 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3432 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3436 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3440 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2416 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3440 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3316 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3444 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3320 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3324 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3328 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3448 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3452 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3456 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2432 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3456 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3348 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3460 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3352 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3356 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3360 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3464 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3468 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3472 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2448 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3472 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3380 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3476 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3384 ; 4-byte 
Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3388 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3392 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3480 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3484 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3488 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2464 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3488 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3412 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3492 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3416 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3420 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3424 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3496 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3500 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3504 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2480 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3504 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3444 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3508 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3448 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 
offset:3452 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3456 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3512 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3516 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3520 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2496 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3520 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3476 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3524 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3480 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3484 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3488 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3528 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3532 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3536 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2512 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3536 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3508 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3540 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3512 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3516 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, 
s[44:47], 0 offset:3520 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3544 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3548 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3552 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2528 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3552 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3540 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3556 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3544 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3548 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3552 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3560 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3564 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3568 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2544 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3568 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3572 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3572 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3576 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3580 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3584 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v1, off, s[40:43], 0 offset:3576 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3580 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3584 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2560 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3584 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3588 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3588 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3592 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3596 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3600 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3592 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3596 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3600 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2576 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3604 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3604 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3608 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3612 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3616 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3608 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3612 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3616 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2592 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3616 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3620 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3620 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3624 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3628 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3632 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3624 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3628 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3632 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2608 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3632 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3636 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3636 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3640 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3644 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3648 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3640 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3644 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3648 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2624 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3648 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3652 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3652 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3656 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3660 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3664 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3656 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3660 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3664 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2640 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3664 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3668 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3668 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3672 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3676 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3680 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3672 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3676 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 
offset:3680 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2656 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3680 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3684 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3684 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3688 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3692 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3696 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3688 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3692 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3696 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2672 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3696 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3700 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3700 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3704 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3708 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3712 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3704 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3708 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3712 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2688 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3712 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3716 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3716 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3720 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3724 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3728 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3720 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3724 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3728 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2704 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3728 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3732 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3732 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3736 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3740 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3744 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3736 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3740 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3744 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2720 
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3744 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3748 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3748 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3752 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3756 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3760 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3752 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3756 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3760 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2736 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3760 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3764 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3764 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3768 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3772 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3776 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3768 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3772 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3776 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2752 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 
offset:3776 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3780 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3780 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3784 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3788 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3792 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3784 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3788 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3792 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2768 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3792 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3796 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3796 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3800 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3804 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3808 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3800 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3804 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3808 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2784 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3808 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword 
v0, off, s[44:47], 0 offset:3812 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3812 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3816 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3820 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3824 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3816 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3820 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3824 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3824 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3828 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3828 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3832 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3836 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3840 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3832 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3836 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3840 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2816 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3840 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3844 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v0, off, s[40:43], 0 offset:3844 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3848 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3852 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3856 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3848 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3852 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3856 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2832 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3856 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3860 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3860 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3864 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3868 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3872 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3864 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3868 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3872 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2848 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3872 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3876 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3876 ; 4-byte Folded Spill ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3880 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3884 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3888 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3880 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3884 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3888 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2864 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3888 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3892 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3892 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3896 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3900 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3904 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3896 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3900 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3904 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2880 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3904 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3908 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3908 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, 
s[44:47], 0 offset:3912 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3916 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3920 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3912 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3916 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3920 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2896 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3920 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3924 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3924 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3928 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3932 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3936 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3928 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3932 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3936 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2912 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3936 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3940 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3940 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3944 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v2, off, s[44:47], 0 offset:3948 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3952 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3944 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3948 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3952 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2928 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3952 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3956 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3956 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3960 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3964 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3968 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3960 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3964 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3968 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2944 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3972 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3972 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3976 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3980 ; 4-byte Folded Spill -; 
GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:3984 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3976 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3980 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:3984 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2960 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:3988 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:3988 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:3992 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:3996 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4000 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:3992 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:3996 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4000 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2976 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4004 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4004 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:4008 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:4012 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4016 ; 4-byte 
Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4008 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4012 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4016 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2992 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4020 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4020 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:4024 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:4028 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4032 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4024 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4028 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4032 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3008 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4036 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4036 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:4040 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:4044 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4048 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 
offset:4040 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4044 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4048 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3024 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4052 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4052 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:4056 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:4060 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4064 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4056 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4060 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4064 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3040 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], 0 offset:4068 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:4068 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], 0 offset:4072 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], 0 offset:4076 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], 0 offset:4080 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4072 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, 
s[40:43], 0 offset:4076 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:4080 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3056 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4080 +; GFX6-NEXT: s_mov_b64 s[4:5], 0x80 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3072 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x40100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3088 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x40500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3104 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x40900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded 
Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3120 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x40d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3136 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x41100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3152 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x41500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3168 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x41900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3184 +; 
GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x41d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x42100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3216 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 
offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x42500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3232 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x42900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3248 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x42d00 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3264 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x43100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3280 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x43500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 
4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3296 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x43900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3312 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x43d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3328 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x44100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3344 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x44500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) 
-; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3360 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x44900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3376 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x44d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 
4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3392 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x45100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3408 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x45500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, 
s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3424 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x45900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3440 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x45d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3456 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x46100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3472 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x46500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded 
Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3488 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x46900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3504 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x46d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 
offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3520 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x47100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3536 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x47500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, 
off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3552 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x47900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3568 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x47d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3584 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x48100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x48500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded 
Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3616 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x48900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3632 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x48d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3648 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x49100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3664 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x49500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3680 
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x49900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3696 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x49d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3712 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 
addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x4a100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3728 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x4a500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3744 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x4a900 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3760 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x4ad00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3776 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x4b100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, 
s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3792 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x4b500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3808 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x4b900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword 
v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3824 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x4bd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3840 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x4c100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3856 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x4c500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3872 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x4c900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 
offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3888 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x4cd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3904 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x4d100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, 
off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3920 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x4d500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3936 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x4d900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: 
buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3952 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4080 ; GFX6-NEXT: s_mov_b32 s2, 0x4dd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3968 ; GFX6-NEXT: s_mov_b32 s2, 0x4e100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded 
Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3984 ; GFX6-NEXT: s_mov_b32 s2, 0x4e500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4000 ; GFX6-NEXT: s_mov_b32 s2, 0x4e900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 
offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4016 ; GFX6-NEXT: s_mov_b32 s2, 0x4ed00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4032 ; GFX6-NEXT: s_mov_b32 s2, 0x4f100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, 
off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4048 ; GFX6-NEXT: s_mov_b32 s2, 0x4f500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4064 ; GFX6-NEXT: s_mov_b32 s2, 0x4f900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4080 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: v_add_i32_e32 v7, vcc, s0, v5 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART @@ -2690,2304 +2684,2303 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3560 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3568 ; 4-byte Folded Reload -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc -; GFX6-NEXT: s_mov_b64 s[2:3], s[18:19] +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3524 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3528 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3532 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3536 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, 
off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4f500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3496 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4f100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3464 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3468 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3472 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4ed00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3428 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3432 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4e900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3396 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3400 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload 
+; GFX6-NEXT: s_mov_b32 s33, 0x4e500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3364 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3368 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3372 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3376 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4e100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3332 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3336 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3340 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3344 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4dd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3272 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4d900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3236 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3240 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3244 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3248 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4d500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 
offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3204 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3208 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3212 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3216 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4d100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3176 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3180 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3184 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4cd00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 
addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3144 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4c900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3112 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3116 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3120 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4c500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
buffer_load_dword v9, off, s[44:47], 0 offset:3076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3080 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3084 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3088 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s33, 0x4c100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:3044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:3048 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:3052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:3056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s28, 0x4bd00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2964 ; 4-byte Folded Reload 
-; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2968 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2972 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2976 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4b900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2932 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2936 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2940 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2944 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4b500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2900 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2904 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2908 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2912 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4b100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2868 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2872 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2876 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2880 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4ad00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2836 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2840 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 
offset:2844 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2848 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4a900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2804 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2808 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2812 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2816 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4a500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2772 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2776 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2780 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, 
off, s[44:47], 0 offset:2784 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s28, 0x4a100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2740 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2744 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2748 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2752 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s24, 0x49d00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2676 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2680 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2684 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2688 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x49900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2644 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2648 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2652 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2656 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x49500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2616 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; 
GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x49100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2584 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2588 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x48d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2552 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x48900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2520 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2524 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2528 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x48500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2484 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2488 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 
offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s24, 0x48100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2456 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s20, 0x47d00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2372 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2376 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2380 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2384 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, 
s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x47900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2340 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2344 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2348 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2352 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x47500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2308 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2312 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2316 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2320 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 
0x47100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2280 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2284 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2288 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x46d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2244 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2248 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2252 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2256 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x46900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 
v[9:12], v[7:8], s[20:23], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2212 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2216 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2220 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2224 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x46500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2180 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2184 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2192 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s20, 0x46100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3984 +; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2152 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2160 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s16, 0x45d00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2084 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2088 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2092 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2096 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x45900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4080 ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2056 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2064 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x45500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:2020 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:2024 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:2028 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2032 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x45100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, 
s[44:47], 0 offset:1988 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1992 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:2000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x44d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1956 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1960 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1964 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1968 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x44900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1924 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v10, off, s[44:47], 0 offset:1928 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1932 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1936 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x44500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1892 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1896 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1900 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1904 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s16, 0x44100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1860 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1864 ; 4-byte Folded Reload 
-; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1868 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1872 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s12, 0x43d00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1780 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1784 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1788 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1792 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x43900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1748 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1752 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1756 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1760 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x43500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1716 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1720 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1724 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1728 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x43100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1684 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1688 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1692 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1696 
; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x42d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1652 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1656 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1660 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1664 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x42900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1624 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1632 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 
4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x42500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1588 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1592 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1596 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1600 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s12, 0x42100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1560 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1568 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s12 
offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3968 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s8, 0x41d00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1496 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x41900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4080 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1464 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1468 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1472 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 
offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x41500 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4064 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1428 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1432 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x41100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4048 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1396 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1400 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 
offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x40d00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4032 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1364 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1368 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1372 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1376 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x40900 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4016 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1332 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1336 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1340 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1344 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x40500 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1300 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1304 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1308 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1312 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s8, 0x40100 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3984 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[44:47], 0 offset:1268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[44:47], 0 offset:1272 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v11, off, s[44:47], 0 offset:1276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v12, off, s[44:47], 0 offset:1280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3968 +; GFX6-NEXT: s_mov_b32 s4, 0x3fd00 +; 
GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3968 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4080 -; GFX6-NEXT: s_mov_b32 s4, 0x4f900 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4f500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4068 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4080 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4f100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, 
s[40:43], 0 offset:4056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4064 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4ed00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4036 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4040 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4048 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4e900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4020 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4024 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4028 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4032 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4e500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4008 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4012 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4016 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4e100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3988 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3992 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:4000 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4dd00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3972 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3976 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3980 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3984 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3968 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4d900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3956 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3960 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3964 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3968 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3952 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4d500 +; GFX6-NEXT: buffer_load_dword v0, off, 
s[40:43], 0 offset:3940 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3944 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3948 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3952 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3936 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4d100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3924 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3928 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3932 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3936 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3920 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4cd00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3908 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3912 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3916 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, 
s[40:43], 0 offset:3920 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3904 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4c900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3892 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3896 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3900 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3904 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3888 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4c500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3876 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3880 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3884 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3888 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3872 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4c100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3860 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3864 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3868 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3872 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3856 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4bd00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3844 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3848 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3852 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3856 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3840 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded 
Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4b900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3828 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3832 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3836 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3840 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3824 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4b500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3812 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3816 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3820 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3824 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3808 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4b100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3796 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3800 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, 
s[40:43], 0 offset:3804 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3808 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3792 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4ad00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3780 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3784 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3788 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3792 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3776 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4a900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3764 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3768 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3772 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3776 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3760 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4a500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3748 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3752 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3756 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3760 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3744 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x4a100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3732 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3736 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3740 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3744 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3728 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x49d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3716 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3720 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3724 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3728 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3712 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x49900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3700 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3704 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3708 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3712 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3696 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x49500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3684 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, 
s[40:43], 0 offset:3688 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3692 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3696 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3680 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x49100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3668 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3672 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3676 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3680 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3664 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x48d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3652 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3656 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3660 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3664 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3648 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x48900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3636 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3640 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3644 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3648 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3632 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x48500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3632 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3616 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x48100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3608 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3616 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3600 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x47d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3588 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3600 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3584 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x47900 +; GFX6-NEXT: buffer_load_dword v0, off, 
s[40:43], 0 offset:3572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3584 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3568 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x47500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3568 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3552 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x47100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3540 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, 
s[40:43], 0 offset:3552 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3536 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x46d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3524 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3528 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3532 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3536 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3520 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x46900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3512 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3520 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3504 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x46500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3504 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3488 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x46100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3476 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3480 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3488 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3472 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded 
Reload -; GFX6-NEXT: s_mov_b32 s4, 0x45d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3468 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3472 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3456 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x45900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3456 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3440 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x45500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3428 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3432 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, 
s[40:43], 0 offset:3436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3440 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3424 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x45100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3416 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3420 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3424 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3408 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x44d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3396 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3400 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3408 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3392 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x44900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3380 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3384 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3388 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3392 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3376 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x44500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3364 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3368 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3372 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3376 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3360 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x44100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3348 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3352 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3356 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3360 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3344 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x43d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3332 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3336 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3340 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3344 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3328 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x43900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3316 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, 
s[40:43], 0 offset:3320 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3324 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3328 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3312 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x43500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3300 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3304 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3308 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3312 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3296 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x43100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3284 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3288 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3292 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3296 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3280 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x42d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3272 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3280 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3264 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x42900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3252 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3256 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3260 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3264 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3248 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x42500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3236 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3240 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3244 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3248 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3232 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x42100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3220 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3224 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3228 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3232 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3216 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x41d00 +; GFX6-NEXT: buffer_load_dword v0, off, 
s[40:43], 0 offset:3204 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3208 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3212 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3216 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3200 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x41900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3192 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3196 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3200 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3184 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x41500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, 
s[40:43], 0 offset:3184 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3168 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x41100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3160 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3164 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3168 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3152 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x40d00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3148 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3152 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3136 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x40900 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3124 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3128 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3136 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3120 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x40500 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3116 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3120 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3104 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded 
Reload -; GFX6-NEXT: s_mov_b32 s4, 0x40100 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3092 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3096 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3104 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3088 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x3fd00 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3084 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3088 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3072 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3064 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3068 ; 4-byte Folded 
Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3072 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3056 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:4072 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:4076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3056 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3040 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:4056 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:4060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4064 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3028 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3032 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3036 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3040 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3024 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4036 
; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:4040 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:4044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:3012 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3016 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3020 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3024 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3008 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4020 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:4024 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:4028 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4032 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:3000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:3004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:3008 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2992 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:4008 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:4012 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4016 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v0, off, s[40:43], 0 offset:2980 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2984 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2988 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2992 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2976 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3988 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3992 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:4000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2964 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2968 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2972 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2976 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2960 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3972 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3976 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3980 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3984 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2948 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2952 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2956 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 
offset:2960 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2944 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3956 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3960 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3964 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3968 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2932 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2936 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2940 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2944 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2928 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3940 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3944 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3948 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3952 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2916 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2920 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2924 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2928 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2912 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3924 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, 
off, s[44:47], 0 offset:3928 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3932 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3936 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2900 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2904 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2908 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2912 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2896 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3908 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3912 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3916 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3920 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2884 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2888 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2892 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2896 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2880 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3892 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3896 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3900 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3904 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2868 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2872 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2876 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2880 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2864 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3876 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3880 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3884 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3888 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2852 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2856 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2860 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2864 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2848 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3860 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3864 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3868 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3872 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2836 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2840 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2844 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2848 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt 
vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2832 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3844 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3848 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3852 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3856 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2820 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2824 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2828 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2832 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2816 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3828 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3832 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3836 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3840 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2804 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2808 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2812 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2816 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3812 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3816 ; 4-byte Folded Reload -; 
GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3820 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3824 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2788 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2792 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2796 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2800 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2784 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3796 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3800 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3804 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3808 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2772 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2776 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2780 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2784 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2768 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3780 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3784 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3788 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3792 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2756 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, 
s[40:43], 0 offset:2760 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2764 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2768 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2752 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3764 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3768 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3772 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3776 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2740 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2744 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2748 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2752 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2736 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3748 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3752 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3756 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3760 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2724 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2728 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2732 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2736 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], 
v[5:6], s[0:3], 0 addr64 offset:2720 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3732 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3736 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3740 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3744 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2708 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2712 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2716 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2720 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2704 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3716 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3720 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3724 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3728 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2692 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2696 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2700 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2704 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2688 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3700 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3704 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 
offset:3708 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3712 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2676 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2680 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2684 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2688 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2672 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3684 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3688 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3692 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3696 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2660 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2664 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2668 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2672 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2656 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3668 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3672 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3676 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3680 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2644 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2648 ; 4-byte Folded Reload +; 
GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2652 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2656 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2640 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3652 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3656 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3660 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3664 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2632 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2636 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2640 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2624 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3636 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3640 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3644 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3648 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2624 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2608 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3624 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3632 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2600 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2608 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2592 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3608 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2588 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2592 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2576 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3588 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3592 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3596 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v3, off, s[44:47], 0 offset:3600 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2568 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2576 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2560 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3572 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3576 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2560 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2544 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3544 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2532 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2536 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 
offset:2540 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2544 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2528 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3512 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2524 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2528 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2512 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3476 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3480 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3484 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2512 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, 
off, s[44:47], 0 offset:3444 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3448 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2496 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3416 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3420 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3424 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2468 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2472 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2476 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2480 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3380 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3384 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3388 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3392 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2464 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3348 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3352 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3356 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3360 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2448 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2432 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3316 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3320 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3324 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3328 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2420 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2424 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2428 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v3, off, s[40:43], 0 offset:2432 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3300 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3304 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3308 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3312 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2416 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2388 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2392 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2396 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2400 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2372 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2376 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2380 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2384 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2356 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2360 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2364 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2368 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword 
v0, off, s[40:43], 0 offset:2340 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2344 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2348 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2352 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2324 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2328 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2332 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2336 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2308 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2312 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2316 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2320 ; 
4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2292 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2296 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2300 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2304 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2284 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2288 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3028 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 
0 offset:3032 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3036 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3040 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2260 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2264 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2272 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:3012 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3016 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3020 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3024 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2244 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2248 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2252 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2256 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2228 ; 4-byte Folded Reload +; 
GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2232 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2236 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2240 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2984 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2212 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2216 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2220 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2224 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2952 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2196 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2200 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2204 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2208 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2920 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2184 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2192 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2888 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2164 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2176 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2856 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2148 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2160 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2824 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2144 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2792 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2116 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 
offset:2120 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2124 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2128 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2760 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2112 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2096 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2732 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2084 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2088 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2092 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2096 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 
addr64 offset:2080 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2720 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2068 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2080 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2064 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2064 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2048 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2036 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2040 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2048 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2032 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2020 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2024 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:2028 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2032 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2016 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2608 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:2004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:2008 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v2, off, s[40:43], 0 offset:2012 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2016 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2568 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1988 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1992 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:2000 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1984 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1972 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1976 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1980 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1984 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1968 ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1956 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1960 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1964 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1968 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1952 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2472 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1940 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1944 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1948 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1952 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1936 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1924 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1928 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1932 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1936 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1920 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2424 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1908 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1912 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1916 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1920 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1904 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1892 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1896 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 
offset:1900 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1904 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1888 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2392 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1876 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1880 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1884 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1888 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1872 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2360 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1860 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1864 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1868 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1872 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1856 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, 
off, s[44:47], 0 offset:2324 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2328 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1844 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1848 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1852 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1856 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1840 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2296 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1828 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1832 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1836 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1840 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1824 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2264 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1812 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1816 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1820 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1824 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1808 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2232 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1796 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1800 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1804 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1808 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1792 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2200 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1780 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1784 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1788 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v3, off, s[40:43], 0 offset:1792 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1776 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1764 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1768 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1772 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1776 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1760 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1748 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1752 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1756 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1760 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1744 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2120 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1732 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1736 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1740 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1744 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1728 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1716 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1720 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1724 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1728 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1712 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword 
v0, off, s[40:43], 0 offset:1700 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1704 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1708 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1712 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1696 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1684 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1688 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1692 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1696 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1680 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1668 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1672 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1676 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1680 ; 
4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1664 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1652 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1656 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1660 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1664 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1648 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1636 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1640 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1644 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1648 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1632 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 
0 offset:1912 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1920 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1632 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1616 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1880 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1608 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1616 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1600 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1852 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1588 ; 4-byte Folded Reload +; 
GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1600 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1584 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1832 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1584 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1568 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1568 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1552 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1800 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1540 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1552 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1536 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1768 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1524 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1528 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1532 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1536 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1520 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1736 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1512 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1520 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1504 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1704 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1504 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1488 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1672 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1476 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 
offset:1480 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1488 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1472 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1640 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1468 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1472 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1456 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1456 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 
addr64 offset:1440 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1428 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1432 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1440 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1424 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1416 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1420 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1424 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1408 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1528 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1396 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1400 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1408 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1392 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1380 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1384 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1388 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1392 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1376 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1480 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1364 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1368 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v2, off, s[40:43], 0 offset:1372 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1376 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1360 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1348 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1352 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1356 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1360 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1344 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1332 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1336 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1340 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1344 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1328 ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1316 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1320 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1324 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1328 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1312 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1352 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1300 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1304 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1308 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1312 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1296 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1320 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1284 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1288 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1292 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1296 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1280 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1272 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1280 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1264 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1252 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1256 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1260 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1264 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1252 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1256 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 
offset:1260 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1264 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1248 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1236 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1240 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1244 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1248 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1236 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1240 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1244 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1248 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1232 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1220 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1224 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1228 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1232 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1220 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1224 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1228 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1232 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1216 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, 
off, s[44:47], 0 offset:1204 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1208 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1212 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1216 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1204 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1208 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1212 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1216 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1200 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1192 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1196 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1200 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1192 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1196 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1200 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1184 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1176 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1180 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1184 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1184 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1168 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1160 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1160 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1164 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1168 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1152 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1144 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1148 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v3, off, s[40:43], 0 offset:1152 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1136 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1124 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1128 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1124 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1128 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1136 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1120 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1112 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1116 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1120 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1116 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1120 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1104 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1092 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1096 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1092 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1096 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1104 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1088 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1076 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1080 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1084 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1088 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1076 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1080 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1084 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1088 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1072 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1060 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1064 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1068 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1072 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword 
v0, off, s[40:43], 0 offset:1060 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1064 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1068 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1072 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1056 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1044 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1048 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1052 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1056 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1044 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1048 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1052 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1056 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1040 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1028 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1032 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1036 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1040 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1028 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1032 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1036 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1040 ; 
4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:1012 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1016 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1020 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1024 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:1012 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1016 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1020 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1024 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1008 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:996 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:1000 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:1004 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:1008 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:996 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:1000 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:1004 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:1008 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:992 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:980 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 
offset:984 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:988 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:992 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:980 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:984 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:988 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:992 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:976 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:964 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:968 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:972 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:976 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:964 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:968 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:972 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:976 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:960 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:948 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:952 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:956 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:960 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:948 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v1, off, s[40:43], 0 offset:952 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:956 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:960 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:944 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:932 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:936 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:940 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:944 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:932 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:936 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:940 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:944 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:928 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:916 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:920 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:924 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:928 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:916 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:920 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:924 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:928 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], 
v[5:6], s[0:3], 0 addr64 offset:912 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:900 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:904 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:908 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:912 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:900 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:904 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:908 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:912 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:896 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:884 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:888 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:892 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:896 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:884 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:888 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:892 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:896 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:880 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:868 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:872 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:876 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:880 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:868 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:872 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:876 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:880 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:864 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:852 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:856 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:860 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:864 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:852 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:856 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:860 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:864 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:848 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:836 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:840 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:844 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:848 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:836 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:840 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, 
s[40:43], 0 offset:844 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:848 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:832 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:820 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:824 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:828 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:832 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:820 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:824 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:828 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:832 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:816 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:804 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:808 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:812 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:816 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:804 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:808 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:812 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:816 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, 
s[44:47], 0 offset:788 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:792 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:796 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:800 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:788 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:792 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:796 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:800 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:784 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:772 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:776 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:780 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:784 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:772 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:776 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:780 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:784 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:768 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:756 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:760 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:764 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:768 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v0, off, s[40:43], 0 offset:756 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:760 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:764 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:768 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:752 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:740 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:744 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:748 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:752 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:740 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:744 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:748 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:752 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:736 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:724 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:728 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:732 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:736 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:724 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:728 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:732 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:736 ; 4-byte 
Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:720 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:708 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:712 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:716 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:720 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:708 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:712 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:716 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:720 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:704 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:692 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:696 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:700 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:704 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:692 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:696 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:700 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:704 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:688 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:676 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:680 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:684 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:688 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:676 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:680 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:684 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:688 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:672 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:660 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:664 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:668 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:672 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:660 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:664 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:668 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:672 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:656 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:644 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:648 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:652 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:656 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:644 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, 
s[40:43], 0 offset:648 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:652 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:656 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:640 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:628 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:632 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:636 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:640 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:628 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:632 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:636 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:640 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:624 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:612 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:616 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:620 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:624 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:612 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:616 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:620 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:624 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 
offset:608 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:596 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:600 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:604 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:608 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:596 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:600 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:604 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:608 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:592 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:580 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:584 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:588 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:592 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:580 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:584 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:588 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:592 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:576 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:564 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:568 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:572 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v3, off, s[44:47], 0 offset:576 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:564 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:568 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:572 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:576 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:560 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:548 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:552 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:556 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:560 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:548 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:552 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:556 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:560 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:544 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:532 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:536 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:540 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:544 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:532 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:536 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:540 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:544 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:528 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:516 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:520 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:524 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:528 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:516 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:520 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:524 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:528 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:512 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:500 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:504 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:508 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:512 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:500 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:504 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:508 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:512 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:484 ; 4-byte 
Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:488 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:492 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:496 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:484 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:488 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:492 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:496 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:468 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:472 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:476 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:480 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:468 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:472 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:476 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:480 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:452 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:456 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:460 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:464 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, 
s[40:43], 0 offset:452 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:456 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:460 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:464 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:436 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:440 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:444 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:448 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:436 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:440 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:444 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:448 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:420 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:424 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:428 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:432 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:420 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:424 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:428 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:432 ; 4-byte Folded Reload ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:404 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:408 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:412 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:416 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:404 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:408 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:412 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:416 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:388 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:392 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:396 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:400 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:388 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:392 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:396 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:400 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:372 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:376 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v2, off, s[44:47], 0 offset:380 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:384 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:372 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:376 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:380 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:384 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:356 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:360 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:364 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:368 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:356 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:360 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:364 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:368 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:340 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:344 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:348 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:352 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:340 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:344 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:348 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:352 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:324 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:328 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:332 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:336 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:324 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:328 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:332 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:336 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:308 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:312 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:316 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:320 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:308 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:312 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:316 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:320 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:304 ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:292 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:296 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:300 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:304 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:292 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:296 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:300 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:304 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:276 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:280 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:284 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:288 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:276 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:280 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:284 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:288 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:260 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:264 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:268 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 
0 offset:272 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:260 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:264 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:268 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:272 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:244 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:248 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:252 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:256 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:244 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:248 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:252 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:256 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:228 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:232 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:236 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:240 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:228 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:232 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:236 ; 4-byte Folded Reload +; GFX6-NEXT: 
buffer_load_dword v3, off, s[40:43], 0 offset:240 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:212 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:216 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:220 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:224 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:212 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:216 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:220 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:224 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:196 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:200 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:204 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:208 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:196 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:200 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:204 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:208 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:180 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v1, off, s[44:47], 0 offset:184 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:188 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:192 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:180 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:184 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:188 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:192 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:164 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:168 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:172 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:176 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:164 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:168 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:172 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:176 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:148 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:152 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:156 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:160 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:148 ; 4-byte 
Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:152 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:156 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:160 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:132 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:136 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:140 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:144 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:132 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:136 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:140 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:144 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:116 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:120 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:124 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:128 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:116 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:120 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:124 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:128 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:100 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:104 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:108 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:112 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:100 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:104 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:108 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:112 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:84 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:88 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:92 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:96 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:84 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:88 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:92 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:96 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:68 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:72 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 
offset:76 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:80 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:68 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:72 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:76 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:80 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:52 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:56 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:60 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:64 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:52 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:56 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:60 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:64 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:36 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:40 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:44 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:48 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:40 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, 
s[40:43], 0 offset:44 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:48 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:20 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:24 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:28 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:32 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:20 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:24 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:28 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:32 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[44:47], 0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[44:47], 0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[44:47], 0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[44:47], 0 offset:16 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], 0 offset:16 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -5009,13 +5002,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr 
addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x84 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x204 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x284 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 @@ -5050,1261 +5036,1268 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 
v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x180 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s5, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x180 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x114 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x114 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x124 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x124 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x134 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x134 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x144 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x154 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x154 ; GFX9-FLATSCR-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x164 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x174 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x200 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x200 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x194 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x194 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1a4 +; GFX9-FLATSCR-NEXT: 
s_movk_i32 s4, 0x1a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: 
s_movk_i32 s7, 0x280 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s7, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x280 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x214 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x214 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x224 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x224 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x234 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x234 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x244 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x244 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 
16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x254 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x254 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x264 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x274 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x300 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s8, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x300 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x294 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x294 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: 
global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x380 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s9, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x380 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x314 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x314 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x324 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x324 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x334 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x334 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 
; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x344 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x344 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x354 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x354 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x364 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x374 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x400 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s10, v2 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x400 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x394 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x394 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 ; 
GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x404 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x404 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x414 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x414 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x424 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x424 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x434 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x434 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x444 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x444 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x454 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x454 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x464 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x464 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x474 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x474 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x484 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x484 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; 
GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x494 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x494 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4e4 ; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x504 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x504 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x514 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x514 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x524 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x524 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x534 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x534 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, 
v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x544 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x544 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x554 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x554 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x564 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x564 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x574 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x574 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x584 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x584 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x594 +; GFX9-FLATSCR-NEXT: 
s_movk_i32 s4, 0x594 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x604 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x604 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x614 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x614 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x624 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x624 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x634 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x634 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 
0x644 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x644 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x654 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x654 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x664 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x664 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x674 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x674 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x684 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x684 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x694 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x694 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill 
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752 -; 
GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x704 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x704 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x714 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x714 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x724 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x724 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x734 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x734 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x744 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x744 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, 
v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x754 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x754 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x764 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x764 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x774 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x774 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x784 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x784 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x794 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x794 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, 
s[2:3] offset:928 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x804 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x804 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x814 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x814 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x824 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x824 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x834 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x834 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x844 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x844 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: 
global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x854 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x854 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x864 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x864 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x874 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x874 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x884 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x884 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x894 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x894 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4 ; GFX9-FLATSCR-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded 
Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x904 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x904 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x914 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x914 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x924 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x924 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x934 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x934 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x944 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x944 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x954 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x954 ; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x964 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x964 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x974 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x974 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x984 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x984 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x994 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x994 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa04 +; 
GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xaa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712 -; 
GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xab4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xab4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xac4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xac4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xad4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xad4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xae4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xae4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xaf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 
v[0:3], v5, s[2:3] offset:1888 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xba4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xba4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; 
GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc64 ; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xca4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xca4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xce4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xce4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd14 +; 
GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xda4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xda4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496 -; 
GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xde4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xde4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 
v[0:3], v5, s[2:3] offset:2672 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xea4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xea4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xeb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xec4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xec4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xed4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xed4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xee4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xee4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xef4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xef4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; 
GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf74 ; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 
off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xff4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xff4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1004 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1004 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1014 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1014 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1024 +; 
GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1024 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1034 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1034 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1044 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1044 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1054 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1054 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1064 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1064 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1074 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1074 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded 
Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1084 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1084 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1094 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1094 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] 
offset:3280 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1104 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1104 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1114 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1114 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1124 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1124 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1134 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1134 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1144 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1154 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1154 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1164 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1174 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1174 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded 
Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1184 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1184 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1194 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1194 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 
0x11d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1204 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1204 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1214 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1214 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1224 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1224 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: 
scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1234 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1234 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1244 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1244 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1254 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1254 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1264 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1274 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1274 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712 -; GFX9-FLATSCR-NEXT: 
s_movk_i32 s11, 0x1284 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1284 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1294 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1294 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, 
v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1304 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1304 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1314 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1314 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1324 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1324 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 
v[0:3], v5, s[2:3] offset:3888 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1334 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1334 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1344 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1344 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1354 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1354 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1364 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1374 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1374 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1384 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1394 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1394 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte 
Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080 ; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -7346,7 +7339,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s10, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7380,7 +7373,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s9, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x380, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7414,7 +7407,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s8, v4 +; GFX9-FLATSCR-NEXT: 
v_add_co_u32_e32 v0, vcc, 0x300, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7448,7 +7441,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s7, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x280, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7482,7 +7475,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x200, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7516,7 +7509,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s5, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x180, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7550,7 +7543,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: 
global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 2f43cc022afd3..fbe0b156cd9ba 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -15,126 +15,126 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: bb.0..expVert: ; CHECK-NEXT: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %56.sub0:sgpr_64 = COPY $sgpr31 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18 - ; CHECK-NEXT: undef %50.sub0:sgpr_64 = COPY $sgpr19 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 - ; CHECK-NEXT: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: 
[[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr31 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr27 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr25 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18 + ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr23 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr9 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc - ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 
65535, implicit-def dead $scc - ; CHECK-NEXT: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL undef %132:sgpr_128 - ; CHECK-NEXT: KILL %130.sub0, %130.sub1 + ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: undef %302.sub1:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL undef %89:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY6]], 64, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, 
implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %176.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %183.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %183.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %190.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %190.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %200.sub0:sreg_64 = S_ADD_U32 %50.sub0, undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %200.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 224, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], undef %171:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], 
[[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %171:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]].sub0, 224, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %210.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %210.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %217.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %217.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %224.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %224.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 576, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: 
[[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]].sub0, 576, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %241.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %241.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %253.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc - ; CHECK-NEXT: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; 
CHECK-NEXT: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], undef %171:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead 
$scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %302, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %312:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %367:sgpr_128, undef %368:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %378:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable 
load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %373:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) - ; CHECK-NEXT: %343.sub1:sreg_64 = 
S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) + ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 
16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %394:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load 
(s128) from %ir.137, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -146,119 +146,119 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], 160, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_25:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_24]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_25:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 
[[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_26:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_24]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_26:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ADD_U32_27:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 168, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) - ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) + ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_28:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_28:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_28]], 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: %71.sub3:sgpr_128 = S_MOV_B32 553734060 - ; CHECK-NEXT: %71.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) - ; CHECK-NEXT: [[COPY13]].sub1:sgpr_128 = COPY %302.sub1 - ; CHECK-NEXT: [[COPY13]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) + ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = 
S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_18]], 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_19]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_29:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_29:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, 
implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_29]], 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_20]], 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], 
implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_30:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_30:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 - ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) - ; CHECK-NEXT: KILL %411.sub0, %411.sub1 - ; CHECK-NEXT: KILL undef %488:sreg_64 - ; CHECK-NEXT: KILL [[COPY15]].sub0_sub1_sub2, [[COPY15]].sub3 - ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_30]], 0, 0 :: (invariant load (s64) from %ir.287, 
addrspace 4) + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 + ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_25]], 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %484:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_25]].sub0, [[S_ADD_U32_25]].sub1 + ; CHECK-NEXT: KILL undef %484:sreg_64 + ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 + ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_26]], 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_31:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_8]], 
implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_31:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_31]], 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] - ; CHECK-NEXT: [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 
[[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_32:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 96, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4) - ; CHECK-NEXT: undef %522.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: %522.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4) - ; CHECK-NEXT: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_33:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: 
[[S_ADD_U32_33:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_33]], 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_34:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_34:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_34]], 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_35:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_35:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_35]], 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; 
CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -363,20 +363,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_60]], [[V_ADD_U32_e64_25]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_61]], [[V_ADD_U32_e64_26]], implicit $exec - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_62]], [[V_ADD_U32_e64_27]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant 
load (s256) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %559:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec - ; CHECK-NEXT: undef %624.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %624, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %573:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 24319a639da44..520ec6e24ae3b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1779,30 +1779,29 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; 
GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 6c98c7def2328..9189cef019cf4 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -16,94 +16,43 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane -; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1 -; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 -; CHECK-NEXT: s_mov_b32 s5, 
0x40280000 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v2, s2, 0 -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s1, 0x40180000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 1 -; CHECK-NEXT: v_writelane_b32 v2, s1, 2 -; CHECK-NEXT: s_mov_b32 s1, 0x40220000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 3 -; CHECK-NEXT: v_writelane_b32 v2, s1, 4 -; CHECK-NEXT: s_mov_b32 s1, 0x40240000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 -; CHECK-NEXT: s_mov_b32 s1, 0x40260000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 +; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_mov_b32 s1, s2 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: s_mov_b32 s3, 0x40260000 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; CHECK-NEXT: s_mov_b32 s2, 0 -; CHECK-NEXT: s_mov_b32 s3, 0x40140000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 9 -; CHECK-NEXT: v_writelane_b32 v2, s6, 10 -; CHECK-NEXT: v_writelane_b32 v2, s7, 11 -; CHECK-NEXT: v_readlane_b32 s6, v2, 1 -; CHECK-NEXT: v_readlane_b32 s7, v2, 2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s1, s7 -; CHECK-NEXT: s_mov_b32 s0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s6, 1 -; CHECK-NEXT: v_writelane_b32 v2, s7, 2 -; CHECK-NEXT: v_readlane_b32 s6, v2, 10 -; CHECK-NEXT: v_readlane_b32 s7, v2, 11 -; CHECK-NEXT: s_mov_b32 s6, s2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1] -; CHECK-NEXT: v_readlane_b32 s0, v2, 3 -; CHECK-NEXT: v_readlane_b32 s1, v2, 4 -; CHECK-NEXT: s_mov_b32 s3, s1 -; CHECK-NEXT: 
s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40140000 +; CHECK-NEXT: s_add_i32 s0, s0, s1 +; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40180000 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40220000 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40240000 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] -; CHECK-NEXT: v_writelane_b32 v2, s0, 3 -; CHECK-NEXT: v_writelane_b32 v2, s1, 4 -; CHECK-NEXT: v_readlane_b32 s0, v2, 5 -; CHECK-NEXT: v_readlane_b32 s1, v2, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s3, s1 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_readlane_b32 s0, v2, 7 -; CHECK-NEXT: v_readlane_b32 s1, v2, 8 -; CHECK-NEXT: s_mov_b32 s3, s1 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: v_readlane_b32 s0, v2, 0 -; CHECK-NEXT: v_readlane_b32 s2, v2, 9 -; CHECK-NEXT: s_add_i32 s2, s2, s0 -; CHECK-NEXT: v_writelane_b32 v2, s2, 9 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] -; CHECK-NEXT: v_readlane_b32 s0, v2, 9 -; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; 
CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: global_store_dwordx2 v[3:4], v[0:1], off -; CHECK-NEXT: ; kill: killed $vgpr2 +; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; CHECK-NEXT: s_endpgm entry: %0 = load i32, ptr addrspace(4) null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index dcf49de684924..05de0bc5f282a 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -39,68 +39,68 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[76:77] +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 
v3, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[6:7], 0x0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[74:75], s[6:7], 0x0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 ; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: s_mov_b64 s[74:75], 0x80 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: 
$vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v4 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -115,7 +115,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80 +; GLOBALNESS1-NEXT: 
v_mov_b32_e32 v1, 0 ; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 @@ -129,7 +130,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[74:75] ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -165,12 +166,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[80:81], s[62:63] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -192,16 +193,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: 
s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -216,7 +217,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -230,7 +231,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[74:75] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] @@ -241,7 +242,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[74:75] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i @@ -258,12 +259,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, 
i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[80:81] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[76:77] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -325,68 +326,68 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[72:73] +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 
1, s[4:5] ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[6:7], 0x0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[76:77], s[6:7], 0x0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: s_mov_b64 s[76:77], 0x80 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; 
GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v4 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -401,7 +402,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0 ; 
GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 @@ -415,7 +417,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -451,12 +453,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[80:81], s[62:63] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[78:79], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -478,16 +480,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, 
s[60:61] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -502,7 +504,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -516,7 +518,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] @@ -527,7 +529,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i @@ -544,12 +546,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; 
GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[80:81] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[78:79] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 7aa36a8b377bf..e809292aad1d3 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1165,30 +1165,29 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], 
s[8:9], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 2f82260888a7d..86e2822a3e5b1 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -33,8 +33,8 @@ define hidden void @widget() { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: s_mov_b64 s[16:17], 0 ; GCN-NEXT: s_mov_b64 s[20:21], -1 +; GCN-NEXT: s_mov_b64 s[16:17], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 ; GCN-NEXT: s_mov_b64 s[46:47], 0 @@ -303,12 +303,12 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[50:51], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 -; GCN-NEXT: flat_load_dword v44, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: flat_load_dword v44, v[0:1] +; GCN-NEXT: s_mov_b64 s[50:51], 0 ; GCN-NEXT: s_getpc_b64 s[52:53] ; GCN-NEXT: s_add_u32 s52, s52, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s53, s53, spam@rel32@hi+12 @@ -329,10 +329,10 @@ define hidden void @blam() { ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: 
flat_load_dword v0, v[42:43] ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 -; GCN-NEXT: s_mov_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_xor_b64 s[56:57], exec, s[8:9] ; GCN-NEXT: s_cbranch_execz .LBB1_12 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 91d09c01639ff..9c316612528c2 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1158,30 +1158,29 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], 
vcc +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index f1edd5c74b105..0f32eb1b12771 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -257,11 +257,10 @@ define amdgpu_kernel void @test_s0_s1_k_f32(ptr addrspace(1) %out, float %a, flo ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] -; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK0_SUB1]]] +; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK0_SUB1]]] -; Same zero component is re-used for half of each immediate. 
-; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 -; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK1_SUB1]]] +; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 +; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[{{[0-9]+}}:[[VK1_SUB1]]] ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 930ba80ad6963..0f1ac360096ae 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -474,7 +474,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr 
addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %52:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %49:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5