diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 35667801c809d..3cae838321885 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9725,6 +9725,40 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( return nullptr; } +bool SIInstrInfo::getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl &InputRegs) const { + assert(MI.getOpcode() == AMDGPU::V_PK_MOV_B32 && + "v_pk_mov_b32 is the only reg-sequence like instruction"); + assert(DefIdx == 0); + + unsigned Src0Mods = MI.getOperand(1).getImm(); + const MachineOperand &Src0 = MI.getOperand(2); + unsigned Src1Mods = MI.getOperand(3).getImm(); + const MachineOperand &Src1 = MI.getOperand(4); + + unsigned SubReg0 = + Src0Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0; + unsigned SubReg1 = + Src1Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0; + + if (!Src0.isUndef()) { + // src0 will provide the result sub0 from src0. + SubReg0 = RI.composeSubRegIndices(Src0.getSubReg(), SubReg0); + InputRegs.push_back( + RegSubRegPairAndIdx(Src0.getReg(), SubReg0, AMDGPU::sub0)); + } + + if (!Src1.isUndef()) { + // src1 will provide the result's sub1 from src1. + SubReg1 = RI.composeSubRegIndices(Src1.getSubReg(), SubReg1); + InputRegs.push_back( + RegSubRegPairAndIdx(Src1.getReg(), SubReg1, AMDGPU::sub1)); + } + + return true; +} + unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 933935a86f9f9..425ff77e8cdc3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,6 +1437,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { int FrameIndex, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; + bool getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl &InputRegs) const override; unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 1afd68767cd3b..a6f8035e93b18 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1107,7 +1107,7 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile, any_fadd>; } // End SubtargetPredicate = HasPackedFP32Ops - let SubtargetPredicate = HasPkMovB32 in + let SubtargetPredicate = HasPkMovB32, isRegSequence = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile>; } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir new file mode 100644 index 0000000000000..602fc29bf1db3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir @@ -0,0 +1,413 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=peephole-opt -o - %s | FileCheck %s + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_1_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_1_0 + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY2]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY3]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vreg_64_align2 = V_PK_MOV_B32 12, %2, 8, %2, 0, 0, 0, 0, 0, implicit $exec + %4:vgpr_32 = COPY %3.sub1 + %5:vgpr_32 = COPY %3.sub0 + $vgpr4 = COPY %4 + $vgpr5 = COPY %5 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_sgpr_1_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8, $sgpr9 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_sgpr_1_0 + ; CHECK: liveins: $sgpr8, $sgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY2]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY3]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:sreg_32 = COPY $sgpr8 + %1:sreg_32 = COPY $sgpr9 + %2:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vreg_64_align2 = V_PK_MOV_B32 12, %2, 8, %2, 0, 0, 0, 0, 0, implicit $exec + %4:vgpr_32 = COPY %3.sub1 + %5:vgpr_32 = COPY %3.sub0 + $vgpr4 = COPY %4 + $vgpr5 = COPY %5 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_sgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr8, $vgpr9, $sgpr10, $sgpr11 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_sgpr_3_0 + ; CHECK: liveins: $vgpr8, $vgpr9, $sgpr10, $sgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr11 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr8 + %1:vgpr_32 = COPY $vgpr9 + %2:sreg_64 = COPY $sgpr10 + %3:sreg_64 = COPY $sgpr11 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_vgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8, $sgpr9, $vgpr10, $vgpr11 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_vgpr_3_0 + ; CHECK: liveins: $sgpr8, $sgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:sreg_32 = COPY $sgpr8 + %1:sreg_32 = COPY $sgpr9 + %2:vgpr_32 = COPY $vgpr10 + %3:vgpr_32 = COPY $vgpr11 + %4:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_lhs + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, undef %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_rhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_rhs + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, undef %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_undef +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_undef + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, undef %4, 8, undef %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_compose_src_subregs_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_compose_src_subregs_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub0_sub1, 8, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vreg_96_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2 + %7:vreg_96_align2 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1, %5, %subreg.sub2 + %8:vreg_64_align2 = V_PK_MOV_B32 12, %6.sub0_sub1, 8, %7.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %9:vgpr_32 = COPY %8.sub1 + %10:vgpr_32 = COPY %8.sub0 + $vgpr4 = COPY %9 + $vgpr5 = COPY %10 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_0_3_compose_src_subregs_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_0_3_compose_src_subregs_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[REG_SEQUENCE]].sub0_sub1, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vreg_96_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2 + %7:vreg_96_align2 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1, %5, %subreg.sub2 + %8:vreg_64_align2 = V_PK_MOV_B32 8, %6.sub0_sub1, 12, %7.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %9:vgpr_32 = COPY %8.sub1 + %10:vgpr_32 = COPY %8.sub0 + $vgpr4 = COPY %9 + $vgpr5 = COPY %10 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_compose_src_subregs_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_compose_src_subregs_1 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub2_sub3, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY8]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY9]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vgpr_32 = COPY $vgpr6 + %7:vgpr_32 = COPY $vgpr7 + %8:vreg_128_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 + %9:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1, %6, %subreg.sub2, %7, %subreg.sub3 + %10:vreg_64_align2 = V_PK_MOV_B32 12, %8.sub2_sub3, 12, %9.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %11:vgpr_32 = COPY %10.sub1 + %12:vgpr_32 = COPY %10.sub0 + $vgpr4 = COPY %11 + $vgpr5 = COPY %12 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index 9e3c044a76295..fafdac7ffac62 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -176,7 +176,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -565,15 +566,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -581,15 +583,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -672,26 +675,31 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -865,7 +873,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -878,7 +887,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -962,7 +972,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -978,7 +989,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1126,13 +1138,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1142,13 +1155,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1289,15 +1303,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1305,15 +1320,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1707,7 +1723,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1723,7 +1740,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1758,13 +1776,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1774,13 +1793,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2132,7 +2152,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2145,7 +2166,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2224,15 +2246,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2240,15 +2263,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2492,13 +2516,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2508,13 +2534,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2798,15 +2826,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2814,15 +2843,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2908,7 +2938,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2921,7 +2952,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 31c458f5338cb..c797ef338dee9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -1280,7 +1280,8 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1295,7 +1296,8 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1433,7 +1435,8 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1450,8 +1453,8 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1961,14 +1964,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1978,15 +1982,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2138,16 +2142,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2155,17 +2160,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v9, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3272,9 +3277,9 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3292,8 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3421,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3439,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3986,17 +3991,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4004,17 +4009,18 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4106,17 +4112,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4124,18 +4130,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4312,7 +4317,8 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4327,7 +4333,8 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4851,16 +4858,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,17 +4876,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4964,28 +4972,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6083,9 +6094,9 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6113,8 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6252,9 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6267,8 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6727,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6745,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6881,10 +6893,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6897,8 +6910,9 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index e3427cd35c683..40d6ad90ab34b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -343,12 +343,13 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -363,8 +364,9 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -454,7 +456,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -469,8 +472,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -557,8 +560,9 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,8 +573,9 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -644,7 +649,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -656,7 +662,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1162,10 +1169,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1180,10 +1188,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1278,16 +1287,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1295,17 +1305,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1392,26 +1402,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1493,26 +1508,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1689,7 +1709,8 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1703,7 +1724,8 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1787,7 +1809,8 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1801,7 +1824,8 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1890,7 +1914,8 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1908,7 +1933,8 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2012,7 +2038,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2030,7 +2057,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2186,16 +2214,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2203,16 +2232,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2309,11 +2339,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2323,15 +2354,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2429,7 +2460,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2447,7 +2479,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2610,10 +2643,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2628,10 +2662,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2724,16 +2759,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2741,16 +2777,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2846,11 +2883,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2860,15 +2897,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3481,7 +3517,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3499,7 +3536,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3535,16 +3573,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3552,16 +3591,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3657,11 +3697,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,15 +3712,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3777,7 +3818,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3795,7 +3837,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4401,7 +4444,8 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4414,7 +4458,8 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4495,7 +4540,8 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4508,7 +4554,8 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4592,7 +4639,8 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4608,8 +4656,8 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4707,7 +4755,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4723,8 +4772,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4814,15 +4863,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4830,15 +4881,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4989,7 +5042,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5006,7 +5060,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5103,7 +5158,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5120,7 +5176,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5331,16 +5388,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5348,16 +5406,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5390,16 +5449,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5407,16 +5467,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5507,16 +5568,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5524,17 +5585,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6133,15 +6193,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6149,15 +6211,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6250,7 +6314,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6266,8 +6331,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6307,7 +6372,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6324,7 +6390,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6421,7 +6488,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6438,7 +6506,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7126,7 +7195,8 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7140,7 +7210,8 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7226,7 +7297,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7240,7 +7312,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7331,7 +7404,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7349,7 +7423,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7448,7 +7523,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7465,8 +7541,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7610,28 +7686,31 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7825,11 +7904,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7839,15 +7918,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7944,9 +8022,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7961,10 +8040,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7997,8 +8076,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8010,8 +8090,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8622,10 +8703,12 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8635,14 +8718,16 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8737,7 +8822,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8754,8 +8840,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8852,7 +8938,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8866,7 +8953,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8897,28 +8985,31 @@ define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9493,7 +9584,8 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9510,7 +9602,8 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9608,7 +9701,8 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9624,8 +9718,8 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9705,7 +9799,8 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,7 +9813,8 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9801,7 +9897,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9814,7 +9911,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9900,7 +9998,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9917,7 +10016,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10016,7 +10116,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10032,8 +10133,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10121,26 +10222,31 @@ define void @v_shuffle_v4f32_v4f32__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10380,14 +10486,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10397,15 +10504,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10503,9 +10610,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10520,10 +10628,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10606,26 +10714,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11133,7 +11246,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11150,7 +11264,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11249,7 +11364,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11265,8 +11381,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11355,26 +11471,31 @@ define void @v_shuffle_v4f32_v4f32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11452,7 +11573,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11465,7 +11587,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 94c61f5ad0e86..86e8e2ed267dc 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -176,7 +176,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -565,15 +566,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -581,15 +583,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -672,26 +675,31 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -868,7 +876,8 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -881,7 +890,8 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -968,7 +978,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -984,7 +995,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1132,13 +1144,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1148,13 +1161,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1295,15 +1309,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1311,15 +1326,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1713,7 +1729,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1729,7 +1746,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1764,13 +1782,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1780,13 +1799,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2138,7 +2158,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2151,7 +2172,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2230,15 +2252,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2246,15 +2269,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2498,13 +2522,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,13 +2540,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2804,15 +2832,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2820,15 +2849,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2914,7 +2944,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2927,7 +2958,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 1b003a7c5d9bc..770c1c19087b4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -1280,7 +1280,8 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1295,7 +1296,8 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1433,7 +1435,8 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1450,8 +1453,8 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1961,14 +1964,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1978,15 +1982,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2138,16 +2142,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2155,17 +2160,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v9, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3272,9 +3277,9 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3292,8 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3421,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3439,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3986,17 +3991,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4004,17 +4009,18 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4106,17 +4112,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4124,18 +4130,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4312,7 +4317,8 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4327,7 +4333,8 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4851,16 +4858,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,17 +4876,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4964,28 +4972,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6083,9 +6094,9 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6113,8 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6252,9 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6267,8 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6727,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6745,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6881,10 +6893,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6897,8 +6910,9 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index 47ad1c4bedb8b..3d1bb1da92e09 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -343,12 +343,13 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -363,8 +364,9 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -454,7 +456,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -469,8 +472,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -557,8 +560,9 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,8 +573,9 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -644,7 +649,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -656,7 +662,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1162,10 +1169,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1180,10 +1188,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1278,16 +1287,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1295,17 +1305,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1392,26 +1402,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1493,26 +1508,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1689,7 +1709,8 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1703,7 +1724,8 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1787,7 +1809,8 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1801,7 +1824,8 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1890,7 +1914,8 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1908,7 +1933,8 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2012,7 +2038,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2030,7 +2057,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2186,16 +2214,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2203,16 +2232,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2309,11 +2339,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2323,15 +2354,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2429,7 +2460,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2447,7 +2479,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2610,10 +2643,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2628,10 +2662,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2724,16 +2759,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2741,16 +2777,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2846,11 +2883,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2860,15 +2897,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3481,7 +3517,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3499,7 +3536,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3535,16 +3573,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3552,16 +3591,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3657,11 +3697,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,15 +3712,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3777,7 +3818,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3795,7 +3837,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4401,7 +4444,8 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4414,7 +4458,8 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4495,7 +4540,8 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4508,7 +4554,8 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4592,7 +4639,8 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4608,8 +4656,8 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4707,7 +4755,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4723,8 +4772,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4814,15 +4863,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4830,15 +4881,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4989,7 +5042,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5006,7 +5060,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5103,7 +5158,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5120,7 +5176,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5331,16 +5388,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5348,16 +5406,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5390,16 +5449,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5407,16 +5467,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5507,16 +5568,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5524,17 +5585,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6133,15 +6193,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6149,15 +6211,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6250,7 +6314,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6266,8 +6331,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6307,7 +6372,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6324,7 +6390,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6421,7 +6488,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6438,7 +6506,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7126,7 +7195,8 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7140,7 +7210,8 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7226,7 +7297,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7240,7 +7312,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7331,7 +7404,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7349,7 +7423,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7448,7 +7523,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7465,8 +7541,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7610,28 +7686,31 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7825,11 +7904,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7839,15 +7918,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7944,9 +8022,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7961,10 +8040,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7997,8 +8076,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8010,8 +8090,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8622,10 +8703,12 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8635,14 +8718,16 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8737,7 +8822,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8754,8 +8840,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8852,7 +8938,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8866,7 +8953,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8897,28 +8985,31 @@ define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9493,7 +9584,8 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9510,7 +9602,8 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9608,7 +9701,8 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9624,8 +9718,8 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9705,7 +9799,8 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,7 +9813,8 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9801,7 +9897,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9814,7 +9911,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9900,7 +9998,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9917,7 +10016,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10016,7 +10116,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10032,8 +10133,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10121,26 +10222,31 @@ define void @v_shuffle_v4i32_v4i32__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10380,14 +10486,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10397,15 +10504,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10503,9 +10610,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10520,10 +10628,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10606,26 +10714,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11133,7 +11246,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11150,7 +11264,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11249,7 +11364,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11265,8 +11381,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11355,26 +11471,31 @@ define void @v_shuffle_v4i32_v4i32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11452,7 +11573,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11465,7 +11587,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index f9253ca1b1ea4..2dd794404158c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -176,7 +176,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -565,15 +566,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -581,15 +583,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -672,26 +675,31 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -868,7 +876,8 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -881,7 +890,8 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -968,7 +978,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -984,7 +995,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1132,13 +1144,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1148,13 +1161,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1295,15 +1309,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1311,15 +1326,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1713,7 +1729,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1729,7 +1746,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1764,13 +1782,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1780,13 +1799,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2138,7 +2158,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2151,7 +2172,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2230,15 +2252,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2246,15 +2269,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2498,13 +2522,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,13 +2540,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2804,15 +2832,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2820,15 +2849,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2914,7 +2944,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2927,7 +2958,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index 28bc61ce57815..d6bd854283180 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -1280,7 +1280,8 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1295,7 +1296,8 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1433,7 +1435,8 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -1450,8 +1453,8 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -1961,14 +1964,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1978,15 +1982,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2138,16 +2142,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2155,17 +2160,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v9, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3272,9 +3277,9 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3292,8 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3421,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3439,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3986,17 +3991,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4004,17 +4009,18 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4106,17 +4112,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4124,18 +4130,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v7, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ; def v[6:8] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4312,7 +4317,8 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4327,7 +4333,8 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 @@ -4851,16 +4858,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,17 +4876,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4964,28 +4972,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6083,9 +6094,9 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6113,8 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6252,9 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6267,8 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6727,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6745,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ; def v[4:6] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6881,10 +6893,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6897,8 +6910,9 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v5, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index 9cc1b9fe6cf0e..e4c7cebb231bd 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -343,12 +343,13 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -363,8 +364,9 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -454,7 +456,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -469,8 +472,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -557,8 +560,9 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,8 +573,9 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -644,7 +649,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -656,7 +662,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1162,10 +1169,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1180,10 +1188,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1278,16 +1287,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1295,17 +1305,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1392,26 +1402,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1493,26 +1508,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1689,7 +1709,8 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1703,7 +1724,8 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1787,7 +1809,8 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1801,7 +1824,8 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -1890,7 +1914,8 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -1908,7 +1933,8 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2012,7 +2038,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2030,7 +2057,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2186,16 +2214,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2203,16 +2232,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2309,11 +2339,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2323,15 +2354,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2429,7 +2460,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -2447,7 +2479,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -2610,10 +2643,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2628,10 +2662,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2724,16 +2759,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2741,16 +2777,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2846,11 +2883,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2860,15 +2897,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3481,7 +3517,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3499,7 +3536,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -3535,16 +3573,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3552,16 +3591,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3657,11 +3697,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,15 +3712,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3777,7 +3818,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -3795,7 +3837,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -4401,7 +4444,8 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4414,7 +4458,8 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4495,7 +4540,8 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4508,7 +4554,8 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4592,7 +4639,8 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4608,8 +4656,8 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4707,7 +4755,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4723,8 +4772,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4814,15 +4863,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4830,15 +4881,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4989,7 +5042,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5006,7 +5060,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5103,7 +5158,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5120,7 +5176,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5331,16 +5388,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5348,16 +5406,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5390,16 +5449,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5407,16 +5467,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5507,16 +5568,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5524,17 +5585,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6133,15 +6193,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6149,15 +6211,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6250,7 +6314,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6266,8 +6331,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6307,7 +6372,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6324,7 +6390,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6421,7 +6488,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6438,7 +6506,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7126,7 +7195,8 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7140,7 +7210,8 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7226,7 +7297,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -7240,7 +7312,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -7331,7 +7404,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] @@ -7349,7 +7423,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: v_mov_b32_e32 v6, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 @@ -7448,7 +7523,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -7465,8 +7541,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -7610,28 +7686,31 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7825,11 +7904,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7839,15 +7918,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7944,9 +8022,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7961,10 +8040,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7997,8 +8076,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8010,8 +8090,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8622,10 +8703,12 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8635,14 +8718,16 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8737,7 +8822,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -8754,8 +8840,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v2, v5 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 @@ -8852,7 +8938,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] @@ -8866,7 +8953,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v1 ; GFX940-NEXT: v_mov_b32_e32 v5, v1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 @@ -8897,28 +8985,31 @@ define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9493,7 +9584,8 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9510,7 +9602,8 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9608,7 +9701,8 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9624,8 +9718,8 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9705,7 +9799,8 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9718,7 +9813,8 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9801,7 +9897,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9814,7 +9911,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -9900,7 +9998,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9917,7 +10016,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10016,7 +10116,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10032,8 +10133,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -10121,26 +10222,31 @@ define void @v_shuffle_v4p3_v4p3__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10380,14 +10486,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10397,15 +10504,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10503,9 +10610,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10520,10 +10628,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: v_mov_b32_e32 v0, v7 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10606,26 +10714,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11133,7 +11246,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11150,7 +11264,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_mov_b32_e32 v4, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11249,7 +11364,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11265,8 +11381,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 ; GFX940-NEXT: v_mov_b32_e32 v6, v7 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11355,26 +11471,31 @@ define void @v_shuffle_v4p3_v4p3__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11452,7 +11573,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11465,7 +11587,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0)