From 0ae8aaff78ebd226f727540256c1c3d2867af59d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 16 Jul 2025 17:11:40 -0700 Subject: [PATCH 1/2] Reapply "[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 (#142188)" This reverts commit ba271cc07334c74df55741701e5b22032c0cddbb. --- llvm/lib/Target/AMDGPU/SIInstructions.td | 47 +++++++++++++++++-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 39 ++++++++++++++- .../AMDGPU/GlobalISel/inst-select-bswap.mir | 19 ++++++++ .../AMDGPU/GlobalISel/inst-select-fshr.mir | 22 ++++++++- .../branch-folding-implicit-def-subreg.ll | 6 +-- .../CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll | 31 ++++++++++++ llvm/test/MC/AMDGPU/gfx10_asm_vop3.s | 24 ++++++++++ llvm/test/MC/AMDGPU/gfx7_err_pos.s | 13 +++++ llvm/test/MC/AMDGPU/gfx8_err_pos.s | 10 ++++ llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s | 24 ++++++++++ .../MC/Disassembler/AMDGPU/gfx10_vop3.txt | 24 ++++++++++ .../test/MC/Disassembler/AMDGPU/gfx9_vop3.txt | 24 ++++++++++ 12 files changed, 271 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2a6fcadd4c49c..d48eb52d2faae 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2473,6 +2473,7 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { +let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2482,6 +2483,35 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; +} // isNotGFX9Plus + +let SubtargetPredicate = isGFX9GFX10 in { +def : GCNPat < + (rotr i32:$src0, i32:$src1), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src0, + /* src2_modifiers */ 0, + $src1, /* clamp */ 0, /* op_sel */ 0) +>; + +foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in +def : GCNPat; + +def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src1, + /* src2_modifiers */ 0, + $src2, /* clamp */ 0, /* op_sel */ 0) +>; +} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3082,6 +3112,8 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; +// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped +// to V_PERM_B32. let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3557,15 +3589,20 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in -def : GCNPat < +let True16Predicate = NotHasTrue16BitInsts in { +defvar BuildVectorToAlignBitPat = (vecTy (DivergentBinFrag (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))), - (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) ->; + (Ty VGPR_32:$b))); + +let SubtargetPredicate = isNotGFX9Plus in +def : GCNPat; + +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat; +} //True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseFakeTrue16Insts in def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 2e7f25b67fb63..aee2f2cb3d771 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,6 +224,12 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile, int_amdgcn_alignbyte>; + +// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. +// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. +defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile>; +defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile>; + let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16>; let True16Predicate = UseFakeTrue16Insts in @@ -265,6 +271,16 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 } // End isReMaterializable = 1 +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat < +(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)), + (i32 (VOP3OpSelMods i32:$src1, i32:$src1_modifiers)), + (i32 (VOP3OpSelMods i32:$src2, i32:$src2_modifiers)))), +(V_ALIGNBYTE_B32_opsel_e64 i32:$src0_modifiers, VSrc_b32:$src0, + i32:$src1_modifiers, VSrc_b32:$src1, + i32:$src2_modifiers, VGPR_32:$src2) +>; + let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)), @@ -1954,6 +1970,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" +defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; +defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; + defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2104,8 +2123,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2248,6 +2267,17 @@ multiclass VOP3_Real_BITOP3_gfx9 op, string AsmName, bit isSingle = 0> } } +// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. +// The following is created to support that. +multiclass VOP3OpSel_Real_gfx9_with_name op, string opName, string AsmName> { + defvar psName = opName#"_e64"; + def _gfx9 : VOP3_Real(psName), SIEncodingFamily.VI>, // note: encoding family is VI + VOP3OpSel_gfx9 (psName).Pfl> { + VOP3_Pseudo ps = !cast(psName); + let AsmString = AsmName # ps.AsmOperands; + } +} + } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2267,8 +2297,10 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; +let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; +} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2313,6 +2345,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; +defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; +defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; + defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir index 5b8c2840b0156..dde566d9643d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s --- name: bswap_i32_vv @@ -19,6 +21,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]] + ; ; GFX8-LABEL: name: bswap_i32_vv ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -26,6 +29,22 @@ body: | ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX9-LABEL: name: bswap_i32_vv + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX10-LABEL: name: bswap_i32_vv + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_BSWAP %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir index 0a4cb3ccf2957..fa95f33909b76 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s --- @@ -24,6 +24,24 @@ body: | ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] ; + ; GFX9-LABEL: name: fshr_s32 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; + ; GFX10-LABEL: name: fshr_s32 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; ; GFX11-LABEL: name: fshr_s32 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index ae90cfb631e8d..92c63fead15ac 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index b77b2f7441a0c..a4c25a593777b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -19,6 +20,18 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX10-LABEL: v_alignbyte_b32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -73,6 +86,24 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX10-LABEL: v_alignbyte_b32_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32_2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index 6bb0f4b1dff2d..3d6af6ba6dbf8 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -3628,6 +3628,18 @@ v_alignbit_b32 v5, v1, v2, exec_lo v_alignbit_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01] +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] + v_alignbyte_b32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] @@ -3715,6 +3727,18 @@ v_alignbyte_b32 v5, v1, v2, exec_lo v_alignbyte_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01] +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] + v_mullit_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s index 9dcbd4a4074af..7b6b241e04707 100644 --- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s @@ -44,3 +44,16 @@ s_load_dword s5, s[2:3], glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc // CHECK-NEXT:{{^}} ^ + +//============================================================================== +// not a valid operand + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s index 1e8457d54049a..a475c739e690d 100644 --- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s @@ -49,3 +49,13 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0 // CHECK-NEXT:{{^}} ^ + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s index f3f4cae22538a..a1cd9ce8ef18e 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s @@ -2829,6 +2829,18 @@ v_alignbit_b32 v5, v1, v2, src_execz v_alignbit_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03] +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] +// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] + v_alignbyte_b32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] @@ -3000,6 +3012,18 @@ v_alignbyte_b32 v5, v1, v2, src_execz v_alignbyte_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03] +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] + v_min3_f32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 721babdd64245..08ed50d92ba83 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1146,6 +1146,18 @@ # GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] 0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04 @@ -1233,6 +1245,18 @@ # GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index 618e081525414..802d6368507e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -11310,6 +11310,18 @@ # CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04 @@ -11406,6 +11418,18 @@ # CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04 From b86ee20df31acb88e910d2a1aec4f3d863676e46 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 16 Jul 2025 23:27:52 -0700 Subject: [PATCH 2/2] Run line for gfx9 --- .../CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index a4c25a593777b..1ec4f250f8726 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -20,6 +21,18 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_endpgm +; ; GFX10-LABEL: v_alignbyte_b32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 @@ -86,6 +99,23 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; ; GFX10-LABEL: v_alignbyte_b32_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1