diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 7ce8f2c1669d6..faf2e86145179 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -434,9 +434,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion- //===----------------------------------------------------------------------===// // GFX950 only builtins. //===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIiiIii", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIiiIii", "nc", "gfx950-insts") + TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts") - TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 887e0f0e05469..b09c21a0cfcb5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19729,7 +19729,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, (uint64_t)0); return Builder.CreateInsertElement(I0, A, 1); } - + case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4: + case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8); + Function *F = CGM.getIntrinsic( + BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 + ? 
Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4 + : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4, + {VT, VT}); + + SmallVector<Value *, 9> Args; + for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I) + Args.push_back(EmitScalarExpr(E->getArg(I))); + return Builder.CreateCall(F, Args); + } case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 841d8fcad0fee..ea9bdcdc21162 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -16,6 +16,7 @@ typedef half v16h __attribute__((ext_vector_type(16))); typedef half v32h __attribute__((ext_vector_type(32))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); +typedef int v8i __attribute__((ext_vector_type(8))); typedef int v16i __attribute__((ext_vector_type(16))); typedef int v32i __attribute__((ext_vector_type(32))); typedef short v2s __attribute__((ext_vector_type(2))); @@ -431,4 +432,18 @@ v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) { return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3); } +// CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4 +// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b) +{ + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); +} + +// CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4 +// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %a, <8 x i32> %b, <16 x
float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b) +void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b) +{ + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b); +} + #endif diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl index 4af67763c40dd..7f0300ec196e3 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl @@ -5,6 +5,7 @@ typedef float float4 __attribute__((ext_vector_type(4))); typedef float float16 __attribute__((ext_vector_type(16))); typedef half half8 __attribute__((ext_vector_type(8))); typedef __bf16 bfloat8 __attribute__((ext_vector_type(8))); +typedef int int8 __attribute__((ext_vector_type(8))); void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) { @@ -26,3 +27,17 @@ void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, fl *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}} *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}} } + +void test_mfma_scale_f32_16x16x128_f8f6f4(__global float4* out, int8 a, int8 b, float4 c, int X, int Y) { + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 
0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}} +} + +void test_mfma_scale_f32_32x32x64_f8f6f4(__global float16* out, int8 a, int8 b, float16 c, int X, int Y) { + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index e0fd2aa5c58a0..2a90bda1a5b16 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -4,12 +4,33 @@ typedef float float4 __attribute__((ext_vector_type(4))); typedef float float16 __attribute__((ext_vector_type(16))); typedef half half8 __attribute__((ext_vector_type(8))); +typedef half half16 __attribute__((ext_vector_type(16))); typedef __bf16 bfloat8 __attribute__((ext_vector_type(8))); +typedef __bf16 bfloat16 __attribute__((ext_vector_type(16))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef int int4 
__attribute__((ext_vector_type(4))); +typedef int int8 __attribute__((ext_vector_type(8))); +typedef int int16 __attribute__((ext_vector_type(16))); void test(__global float4* out0, half8 a0, half8 b0, float4 c0, __global float16* out1, half8 a1, half8 b1, float16 c1, - __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) { + __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2, + __global int4* out3, int4 a3, int4 b3, int4 c3, + __global int16* out4, int4 a4, int4 b4, int16 c4, + __global float4* out5, bfloat8 a5, bfloat8 b5, float4 c5, + __global float4* out6, half8 a6, half16 b6, float4 c6, + __global float16* out7, half8 a7, half16 b7, float16 c7, + __global float4* out8, bfloat8 a8, bfloat16 b8, float4 c8, + __global float16* out9, bfloat8 a9, bfloat16 b9, float16 c9, + __global int4* out10, int4 a10, int8 b10, int4 c10, + __global int16* out11, int4 a11, int8 b11, int16 c11, + __global float4* out12, int4 a12, int8 b12, float4 c12, + __global float16* out13, int4 a13, int8 b13, float16 c13, + __global float4* out14, int8 a14, int8 b14, float4 c14, int d14, int e14, + __global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15) { *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}} *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}} *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}} + *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}} + *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // 
expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}} } diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index a25b6feddbedd..161363e0dd6bc 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1397,6 +1397,16 @@ The AMDGPU backend implements the following LLVM IR intrinsics. used by hardware to control active lanes when used in EXEC register. For example, ballot(i1 true) return EXEC mask. + llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4 Emit `v_mfma_scale_f32_16x16x128_f8f6f4` to set the scale factor. The + last 4 operands correspond to the scale inputs. + + - 2-bit byte index to use for each lane for matrix A + - Matrix A scale values + - 2-bit byte index to use for each lane for matrix B + - Matrix B scale values + + llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4 Emit `v_mfma_scale_f32_32x32x64_f8f6f4` + ============================================== ========================================================== .. TODO:: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 360af786c5160..3a5fc86183ca0 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2968,6 +2968,35 @@ class AMDGPUMfmaIntrinsic : [IntrConvergent, IntrNoMem, ImmArg>, ImmArg>, ImmArg>]>; + +// srcA's format is determined by cbsz. srcB's format is determined by +// blgp. +// +// These should be <8 x i32> for f8 formats, <6 x i32> for f6 formats, +// and <4 x i32> for f4 formats. If the format control bits imply a +// smaller type than used, the high elements will be truncated. +// +// If the format control bits imply a larger type than used, the high +// elements are padded with undef. 
+ +class AMDGPUMfmaScaleIntrinsic<LLVMType DestTy> : + DefaultAttrsIntrinsic<[DestTy], + [llvm_anyvector_ty, llvm_anyvector_ty, DestTy, + llvm_i32_ty, // cbsz + llvm_i32_ty, // blgp + // llvm_i1_ty, // TODO: neg_src2 + // llvm_i1_ty, // TODO: abs_src2 + // llvm_i1_ty, // TODO: clamp + llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2? + llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale) + llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2? + llvm_i32_ty // v_mfma_ld_scale_b32 src1 (B matrix scale) + ], + [IntrConvergent, IntrNoMem, + ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, + ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>> + ]>; + defset list AMDGPUMFMAIntrinsics908 = { def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic; @@ -3119,6 +3148,8 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>; +def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index d348f489d95dd..88fa96bd049f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -423,3 +423,6 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv; + +def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, + GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 28d215e7b3de9..30b9e3ea11ef4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1258,6 +1258,7 @@
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (isa(Src)) { return IC.replaceInstUsesWith(II, Src); } + return std::nullopt; } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index f2fe3befd04d5..f7cba2f13c77a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5742,6 +5742,18 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +/// Convert from 2-bit value to enum values used for op_sel* source modifiers. +void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( + MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { + unsigned Val = MI.getOperand(OpIdx).getImm(); + unsigned New = 0; + if (Val & 0x1) + New |= SISrcMods::OP_SEL_0; + if (Val & 0x2) + New |= SISrcMods::OP_SEL_1; + MIB.addImm(New); +} + bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const { return TII.isInlineConstant(Imm); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 42343104812b6..563e40267f04b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -364,6 +364,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB, + const MachineInstr &MI, int OpIdx) const; bool isInlineImmediate(const APInt &Imm) const; bool isInlineImmediate(const APFloat &Imm) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 671070c70f0c4..6a5065cd4a0e8 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -40,7 +40,7 @@ class AMDGPUInst SoftFail = 0; + field bits<128> SoftFail = 0; // FIXME: If this is smaller than largest instruction, DecodeEmitter crashes let DecoderNamespace = Namespace; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b648b68f3bd2b..7aae9194b5cd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4769,6 +4769,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + OpdsMapping[0] = + Info->mayNeedAGPRs() + ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = + Info->mayNeedAGPRs() + ?
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + + OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); + OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 06df08feda8fa..f90121a86c846 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -512,6 +512,17 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) { return DecoderUInt128(Lo, Hi); } +static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) { + assert(Bytes.size() >= 16); + uint64_t Lo = + support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); + Bytes = Bytes.slice(8); + uint64_t Hi = + support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); + Bytes = Bytes.slice(8); + return DecoderUInt128(Lo, Hi); +} + DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes_, uint64_t Address, @@ -548,6 +559,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); + + } else if (Bytes.size() >= 16 && + STI.hasFeature(AMDGPU::FeatureGFX950Insts)) { + DecoderUInt128 DecW = eat16Bytes(Bytes); + if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS)) + break; + + // Reinitialize Bytes + Bytes = Bytes_.slice(0, MaxInstBytesNum); } if (Bytes.size() >= 8) { @@ -759,6 +779,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA) convertSDWAInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI) + convertMAIInst(MI); + int VDstIn_Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); if (VDstIn_Idx != -1) { @@ -837,6 +860,58 @@ void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { } } +/// Adjust the register values used by V_MFMA_F8F6F4_f8_f8 instructions to the +/// appropriate subregister for the used format width. +static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI, + MCOperand &MO, uint8_t NumRegs) { + switch (NumRegs) { + case 4: + return MO.setReg(MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3)); + case 6: + return MO.setReg( + MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5)); + case 8: + // No-op in cases where one operand is still f8/bf8. + return; + default: + llvm_unreachable("Unexpected size for mfma f8f6f4 operand"); + } +} + +/// f8f6f4 instructions have different pseudos depending on the used formats. In +/// the disassembler table, we only have the variants with the largest register +/// classes which assume using an fp8/bf8 format for both operands. The actual +/// register class depends on the format in blgp and cbsz operands. Adjust the +/// register classes depending on the used format. 
+void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const { + int BlgpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::blgp); + if (BlgpIdx == -1) + return; + + int CbszIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cbsz); + + unsigned CBSZ = MI.getOperand(CbszIdx).getImm(); + unsigned BLGP = MI.getOperand(BlgpIdx).getImm(); + + const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode = + AMDGPU::getMFMA_F8F6F4_WithFormatArgs(CBSZ, BLGP, MI.getOpcode()); + if (!AdjustedRegClassOpcode || + AdjustedRegClassOpcode->Opcode == MI.getOpcode()) + return; + + MI.setOpcode(AdjustedRegClassOpcode->Opcode); + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx), + AdjustedRegClassOpcode->NumRegsSrcA); + adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx), + AdjustedRegClassOpcode->NumRegsSrcB); +} + struct VOPModifiers { unsigned OpSel = 0; unsigned OpSelHi = 0; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 694cd7a9bfd28..3e20a2ab9e66c 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -204,6 +204,7 @@ class AMDGPUDisassembler : public MCDisassembler { void convertVINTERPInst(MCInst &MI) const; void convertFMAanyK(MCInst &MI, int ImmLitIdx) const; void convertSDWAInst(MCInst &MI) const; + void convertMAIInst(MCInst &MI) const; void convertDPP8Inst(MCInst &MI) const; void convertMIMGInst(MCInst &MI) const; void convertVOP3DPPInst(MCInst &MI) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 3c9f6d2938075..56ed29ede02c2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -59,6 +59,10 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { if (STI->hasFeature(AMDGPU::FeatureNSAEncoding)) return 20; + // VOP3PX encoding. + if (STI->hasFeature(AMDGPU::FeatureGFX950Insts)) + return 16; + // 64-bit instruction with 32-bit literal. if (STI->hasFeature(AMDGPU::FeatureVOP3Literal)) return 12; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 8f297726a0df8..f812ae652b63d 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1048,6 +1048,18 @@ enum Offset_COV5 : unsigned { } // namespace ImplicitArg +namespace MFMAScaleFormats { +// Enum value used in cbsz/blgp for F8F6F4 MFMA operations to select the matrix +// format. +enum MFMAScaleFormats { + FP8_E4M3 = 0, + FP8_E5M2 = 1, + FP6_E2M3 = 2, + FP6_E3M2 = 3, + FP4_E2M1 = 4 +}; +} // namespace MFMAScaleFormats + namespace VirtRegFlag { // Virtual register flags used for various target specific handlings during // codegen. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8c8d55bbc0131..1406938592b2c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15454,6 +15454,23 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MRI.setRegClass(Op.getReg(), NewRC); } + if (TII->isMAI(MI)) { + // The ordinary src0, src1, src2 were legalized above. + // + // We have to also legalize the appended v_mfma_ld_scale_b32 operands, + // as a separate instruction. 
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src0); + if (Src0Idx != -1) { + int Src1Idx = Src0Idx + 2; + assert(Src1Idx == AMDGPU::getNamedOperandIdx( + MI.getOpcode(), AMDGPU::OpName::scale_src1)); + if (TII->usesConstantBus(MRI, MI, Src0Idx) && + TII->usesConstantBus(MRI, MI, Src1Idx)) + TII->legalizeOpWithMove(MI, Src1Idx); + } + } + if (!HasAGPRs) return; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index dd1ab2c628715..267c9a94b9096 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -300,6 +300,11 @@ class Enc96 { int Size = 12; } +class Enc128 { + field bits<128> Inst; + int Size = 16; +} + def CPolBit { int GLC = 0; int SLC = 1; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 1f7fff76d1521..e55418326a4bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1115,6 +1115,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineOperand &MO, const MCOperandInfo &OpInfo) const; + bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineInstr &MI, + int OpIdx) const { + return usesConstantBus(MRI, MI.getOperand(OpIdx), + MI.getDesc().operands()[OpIdx]); + } + /// Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp.
bool hasModifiers(unsigned Opcode) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d2024cf915874..a6496cd4a61f1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -914,6 +914,16 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; +def MFMALdScaleXForm : SDNodeXFormgetZExtValue(); + unsigned New = 0; + if (Val & 0x1) + New |= SISrcMods::OP_SEL_0; + if (Val & 0x2) + New |= SISrcMods::OP_SEL_1; + return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); +}]>; + def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{ const SITargetLowering &Lowering = *static_cast(getTargetLowering()); @@ -1515,6 +1525,10 @@ class PackedIntInputMods : InputMods < def PackedF16InputMods : PackedFPInputMods; def PackedI16InputMods : PackedIntInputMods; +def MFMALdScaleModifierOp : TImmLeaf(Imm); +}], MFMALdScaleXForm>; + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -2655,6 +2669,8 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field string AsmVOPDX = getAsmVOPDPart.ret; field string AsmVOPDY = getAsmVOPDPart.ret; field string TieRegDPP = "$old"; + field bit IsSMFMAC = false; + field bit HasAbid = !and(IsMAI, HasSrc1); } class VOP_NO_EXT : VOPProfile { @@ -2851,6 +2867,31 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>; def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>; def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>; def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>; +def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>; + +def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>; +def VOP_V4F32_V6I32_V8I32_V4F32 : 
VOPProfile <[v4f32, v6i32, v8i32, v4f32]>; +def VOP_V4F32_V6I32_V6I32_V4F32 : VOPProfile <[v4f32, v6i32, v6i32, v4f32]>; + +def VOP_V4F32_V8I32_V4I32_V4F32 : VOPProfile <[v4f32, v8i32, v4i32, v4f32]>; +def VOP_V4F32_V4I32_V8I32_V4F32 : VOPProfile <[v4f32, v4i32, v8i32, v4f32]>; +def VOP_V4F32_V6I32_V4I32_V4F32 : VOPProfile <[v4f32, v6i32, v4i32, v4f32]>; +def VOP_V4F32_V4I32_V6I32_V4F32 : VOPProfile <[v4f32, v4i32, v6i32, v4f32]>; +def VOP_V4F32_V4I32_V4I32_V4F32 : VOPProfile <[v4f32, v4i32, v4i32, v4f32]>; + +def VOP_V16F32_V8I32_V8I32_V16F32 : VOPProfile <[v16f32, v8i32, v8i32, v16f32]>; +def VOP_V16F32_V8I32_V6I32_V16F32 : VOPProfile <[v16f32, v8i32, v6i32, v16f32]>; +def VOP_V16F32_V6I32_V8I32_V16F32 : VOPProfile <[v16f32, v6i32, v8i32, v16f32]>; +def VOP_V16F32_V6I32_V6I32_V16F32 : VOPProfile <[v16f32, v6i32, v6i32, v16f32]>; + +def VOP_V16F32_V8I32_V4I32_V16F32 : VOPProfile <[v16f32, v8i32, v4i32, v16f32]>; +def VOP_V16F32_V4I32_V8I32_V16F32 : VOPProfile <[v16f32, v4i32, v8i32, v16f32]>; +def VOP_V16F32_V6I32_V4I32_V16F32 : VOPProfile <[v16f32, v6i32, v4i32, v16f32]>; +def VOP_V16F32_V4I32_V6I32_V16F32 : VOPProfile <[v16f32, v4i32, v6i32, v16f32]>; +def VOP_V16F32_V4I32_V4I32_V16F32 : VOPProfile <[v16f32, v4i32, v4i32, v16f32]>; + +def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>; +def VOP_V16I32_V4I32_V4I32_V16I32 : VOPProfile <[v16i32, v4i32, v4i32, v16i32]>; class Commutable_REV { @@ -3114,6 +3155,16 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +// Map encoded mfma(_scale)?_f8f6f4 instructions depending on the +// number of registers required for the used format. 
+def getMFMA_F8F6F4_WithSize : GenericTable { + let FilterClass = "MFMA_F8F6F4_WithSizeTable"; + let CppTypeName = "MFMA_F8F6F4_Info"; + let Fields = [ "Opcode", "F8F8Opcode", "NumRegsSrcA", "NumRegsSrcB" ]; + let PrimaryKey = [ "NumRegsSrcA", "NumRegsSrcB", "F8F8Opcode" ]; + let PrimaryKeyName = "getMFMA_F8F6F4_InstWithNumRegs" ; +} + def FP8DstByteSelTable : GenericTable { let FilterClass = "VOP3_Pseudo"; let CppTypeName = "FP8DstByteSelInfo"; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 46b2b4a389200..e3baeed01841a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1346,11 +1346,14 @@ class AVSrcOperand def AVSrc_32 : AVSrcOperand; def AVSrc_64 : AVSrcOperand; def AVSrc_128 : AVSrcOperand; +def AVSrc_192 : AVSrcOperand; +def AVSrc_256 : AVSrcOperand; class AVDstOperand : AVOperand; def AVDst_128 : AVDstOperand; +def AVDst_256 : AVDstOperand; def AVDst_512 : AVDstOperand; class AVLdStOperand diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 501d00b1f308d..c7e4659b15d29 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -415,6 +415,8 @@ struct FP8DstByteSelInfo { #define GET_WMMAOpcode2AddrMappingTable_IMPL #define GET_WMMAOpcode3AddrMappingTable_DECL #define GET_WMMAOpcode3AddrMappingTable_IMPL +#define GET_getMFMA_F8F6F4_WithSize_DECL +#define GET_getMFMA_F8F6F4_WithSize_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -523,6 +525,30 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? 
Info->is_gfx940_xdl : false; } +static uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { + switch (EncodingVal) { + case MFMAScaleFormats::FP6_E2M3: + case MFMAScaleFormats::FP6_E3M2: + return 6; + case MFMAScaleFormats::FP4_E2M1: + return 4; + case MFMAScaleFormats::FP8_E4M3: + case MFMAScaleFormats::FP8_E5M2: + default: + return 8; + } + + llvm_unreachable("covered switch over mfma scale formats"); +} + +const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, + unsigned BLGP, + unsigned F8F8Opcode) { + uint8_t SrcANumRegs = mfmaScaleF8F6F4FormatToNumRegs(CBSZ); + uint8_t SrcBNumRegs = mfmaScaleF8F6F4FormatToNumRegs(BLGP); + return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode); +} + unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) { if (ST.hasFeature(AMDGPU::FeatureGFX12Insts)) return SIEncodingFamily::GFX12; @@ -2912,6 +2938,7 @@ const AlwaysUniform *lookupAlwaysUniform(unsigned Intr); #define GET_Gfx9BufferFormat_IMPL #define GET_Gfx10BufferFormat_IMPL #define GET_Gfx11PlusBufferFormat_IMPL + #include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 88a6d75b72c7d..b0581711961b1 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -581,6 +581,18 @@ unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST); LLVM_READONLY CanBeVOPD getCanBeVOPD(unsigned Opc); +struct MFMA_F8F6F4_Info { + unsigned Opcode; + unsigned F8F8Opcode; + uint8_t NumRegsSrcA; + uint8_t NumRegsSrcB; +}; + +LLVM_READONLY +const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, + unsigned BLGP, + unsigned F8F8Opcode); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 
876d4e1acf596..a6ec1dba23aad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -18,6 +18,7 @@ class VOP3P_Profile, VOP3P_LD_SCALE> { @@ -542,19 +543,23 @@ def VOPProfileAccWrite : VOP3P_Profile { } class VOPProfileMAI + RegisterOperand SrcARC = AVSrc_32, RegisterOperand SrcBRC = SrcARC> : VOP3P_Profile { + bit HasAbid = true; let DstRC = _DstRC; - let Src0RC64 = SrcABRC; - let Src1RC64 = SrcABRC; + let Src0RC64 = SrcARC; + let Src1RC64 = SrcBRC; let Src2RC64 = _SrcRC; let HasOpSel = 0; let HasClamp = 0; let HasIntClamp = 0; let HasOMod = 0; let HasModifiers = 0; - let AsmVOP3Base = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, CBSZ:$cbsz, ABID:$abid, blgp:$blgp); + let AsmVOP3Base = "$vdst, $src0, $src1, $src2$cbsz"#!if(HasAbid,"$abid","")#"$blgp"; + let Ins64 = !con( + (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, CBSZ:$cbsz), + !if(HasAbid, (ins ABID:$abid), (ins)), + (ins blgp:$blgp)); let InsVOP3Base = Ins64; // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. 
// We then create two versions of the instruction: with tied dst and src2 @@ -572,6 +577,7 @@ class VOPProfileSMFMAC; @@ -639,14 +645,112 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI; def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI; + +let HasAbid = false in { +// For f32_16x16x128_f8f6f4 - f8 x f8 case +def VOPProfileMAI_F32_V8I32_V8I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V8I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f8 x f6 case +def VOPProfileMAI_F32_V8I32_V6I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V6I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f6 x f8 case +def VOPProfileMAI_F32_V6I32_V8I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V8I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f6 x f6 case +def VOPProfileMAI_F32_V6I32_V6I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V6I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f6 x f4 case +def VOPProfileMAI_F32_V6I32_V4I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V4I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f4 x f6 case +def VOPProfileMAI_F32_V4I32_V6I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V6I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f8 x f4 case +def VOPProfileMAI_F32_V8I32_V4I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V4I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f4 x f8 case +def VOPProfileMAI_F32_V4I32_V8I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V8I32_X128_VCD : VOPProfileMAI; + +// For f32_16x16x128_f8f6f4 - f4 x f4 case +def VOPProfileMAI_F32_V4I32_V4I32_X128 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V4I32_X128_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f8 x f8 case +def VOPProfileMAI_F32_V8I32_V8I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V8I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f8 x f6 case +def VOPProfileMAI_F32_V8I32_V6I32_X512 : 
VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V6I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f8 x f4 case +def VOPProfileMAI_F32_V8I32_V4I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V8I32_V4I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f4 x f8 case +def VOPProfileMAI_F32_V4I32_V8I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V8I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f6 x f8 case +def VOPProfileMAI_F32_V6I32_V8I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V8I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f6 x f6 case +def VOPProfileMAI_F32_V6I32_V6I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V6I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f6 x f4 case +def VOPProfileMAI_F32_V6I32_V4I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V6I32_V4I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f4 x f6 case +def VOPProfileMAI_F32_V4I32_V6I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V6I32_X512_VCD : VOPProfileMAI; + +// For f32_32x32x64_f8f6f4 - f4 x f4 case +def VOPProfileMAI_F32_V4I32_V4I32_X512 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I32_V4I32_X512_VCD : VOPProfileMAI; +} + + class MFMATable { bit IsMac = is_mac; string FMAOp = Name; } -class MAIFrag : PatFrag < - (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$abid, node:$blgp), - (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), +class MFMA_F8F6F4_WithSizeTable { + Instruction F8F8Opcode = F8F8Variant; + Instruction Opcode = ThisVariant; + bits<8> NumRegsSrcA = A; + bits<8> NumRegsSrcB = B; +} + +class MFMA_F8F6F4_WithSizeTable_Helper : + MFMA_F8F6F4_WithSizeTable(NAME), + !cast(F8F8Op)> { +} + +// Currently assumes scaled instructions never have abid +class MAIFrag : PatFrag < + !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, + node:$scale_src0_opsel, node:$scale_src0, + node:$scale_src1_opsel, node:$scale_src1), + !con((ops node:$src0, 
node:$src1, node:$src2, node:$cbsz), + !if(HasAbid, (ops node:$abid), (ops)), + (ops node:$blgp))), + !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1), + !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), + (Op $src0, $src1, $src2, $cbsz, $blgp))), pred >; @@ -666,11 +770,15 @@ defvar MayNotNeedAGPRs_gisel = [{ return !MF.getInfo()->mayNeedAGPRs(); }]; -class AgprMAIFrag : MAIFrag { +class AgprMAIFrag : + MAIFrag { let GISelPredicateCode = MayNeedAGPRs_gisel; } -class VgprMAIFrag : MAIFrag { +class VgprMAIFrag : + MAIFrag { let GISelPredicateCode = MayNotNeedAGPRs_gisel; } @@ -681,27 +789,51 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -class MAIInst - : VOP3InstBase { +class MAIInst + : VOP3InstBase { let SubtargetPredicate = HasMAIInsts; Instruction Opcode = !cast(NAME); bit is_dgemm = 0; bit is_gfx940_xdl = 0; + let PseudoInstr = NAME; // FIXME: Why is this not the default } -multiclass MAIInst { +// FIXME: Intrinsic should probably not have op_sel operands, we can +// pattern match byte select patterns into op_sel. +// FIXME: Missing neg and clamp modifiers +// +// FIXME: Usual syntax for op_sel is quite hostile here. +class ScaledMAIInst : + MAIInst { + // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them. + let InOperandList = !con(BaseInst.InOperandList, + (ins VSrc_b32:$scale_src0, + VSrc_b32:$scale_src1, + op_sel0:$scale_src0_opsel, + op_sel_hi0:$scale_src1_opsel)); + let AsmOperands = + "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1" + "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp"; + + let FixedSize = 1; + let Size = 16; +} + +multiclass MAIInst { defvar NoDstOverlap = !cast("VOPProfileMAI_" # P).NoDstOverlap; let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { def _e64 : MAIInst("VOPProfileMAI_" # P), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag)>, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag), Scaled>, MFMATable<0, NAME # "_e64">; let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in def _vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag)>, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag), Scaled>, MFMATable<0, NAME # "_vgprcd_e64">; } @@ -710,18 +842,77 @@ multiclass MAIInst { isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { def "_mac_e64" : MAIInst("VOPProfileMAI_" # P), - !if(!eq(node, null_frag), null_frag, AgprMAIFrag)>, + !if(!eq(node, null_frag), null_frag, AgprMAIFrag), Scaled>, MFMATable<1, NAME # "_e64">; let OtherPredicates = [isGFX90APlus] in def _mac_vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), - !if(!eq(node, null_frag), null_frag, VgprMAIFrag)>, + !if(!eq(node, null_frag), null_frag, VgprMAIFrag), Scaled>, MFMATable<1, NAME # "_vgprcd_e64">; } } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } +// Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32 +multiclass ScaledMAIInst_mc { + defvar VariantSuffix = !subst(!toupper(OpName), "", NAME); // Drop the main opcode name prefix to get the "_fN_fM" suffix. 
+ defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; + + defvar HasAbid = false; + + defvar NoDstOverlap = !cast(!cast(UnscaledOpName#"_e64").Pfl).NoDstOverlap; + + def _e64 : ScaledMAIInst(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag)>, + MFMATable<0, NAME # "_e64">; + + def _vgprcd_e64 : ScaledMAIInst(UnscaledOpName#"_vgprcd_e64"), !if(NoDstOverlap, null_frag, VgprMAIFrag)>, + MFMATable<0, NAME # "_vgprcd_e64">; + + if NoDstOverlap then { + let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), + isConvertibleToThreeAddress = NoDstOverlap, + Mnemonic = UnscaledOpName_ in { + def _mac_e64 : ScaledMAIInst(UnscaledOpName # "_mac_e64"), AgprMAIFrag>, + MFMATable<1, NAME # "_e64">; + + def _mac_vgprcd_e64 : ScaledMAIInst(UnscaledOpName # "_mac_vgprcd_e64"), VgprMAIFrag>, + MFMATable<1, NAME # "_vgprcd_e64">; + } + } +} + +// Each of SrcA and SrcB can be encoded using 3 different sizes, so +// define 9 permutations of register classes. +multiclass MAIInst_SrcFormats_mc { + defvar HasAbid = false; + defm _f8_f8 : MAIInst; + defm _f8_f6 : MAIInst; + defm _f6_f8 : MAIInst; + defm _f6_f6 : MAIInst; + defm _f8_f4 : MAIInst; + defm _f4_f8 : MAIInst; + defm _f6_f4 : MAIInst; + defm _f4_f6 : MAIInst; + defm _f4_f4 : MAIInst; +} + +multiclass MAIInst_SrcFormats_Scaled_mc { + defm _f8_f8 : ScaledMAIInst_mc; + defm _f8_f6 : ScaledMAIInst_mc; + defm _f6_f8 : ScaledMAIInst_mc; + defm _f6_f6 : ScaledMAIInst_mc; + defm _f8_f4 : ScaledMAIInst_mc; + defm _f4_f8 : ScaledMAIInst_mc; + defm _f6_f4 : ScaledMAIInst_mc; + defm _f4_f6 : ScaledMAIInst_mc; + defm _f4_f4 : ScaledMAIInst_mc; +} + defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; @@ -753,6 +944,20 @@ let SubtargetPredicate = 
HasGFX950Insts, is_gfx940_xdl = 1 in { defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>; defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>; defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>; + +defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_16x16x128f8f6f4", + "_X128">; +defm V_MFMA_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_32x32x64f8f6f4", + "_X512">; + +defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_Scaled_mc< + "v_mfma_scale_f32_16x16x128_f8f6f4", "V_MFMA_F32_16X16X128_F8F6F4", + int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4>; + +defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_Scaled_mc< + "v_mfma_scale_f32_32x32x64_f8f6f4", + "V_MFMA_F32_32X32X64_F8F6F4", + int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4>; } let SubtargetPredicate = HasGFX950Insts in { @@ -869,7 +1074,6 @@ def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>; def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>; def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>; def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>; -def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>; def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>; @@ -1676,6 +1880,26 @@ multiclass VOP3P_Real_MFMA_gfx940 op, string Name = !cast(N } } +multiclass VOP3P_Real_MFMA_F8F6F4_gfx940 op, string Name = !cast(NAME#"_e64").Mnemonic, + VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), + VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { + + defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8"; + + let AssemblerPredicate = isGFX940Plus, + DecoderNamespace = "GFX940", + AsmString = Name # 
PS_ACD.AsmOperands, + Constraints = "" in { + def _gfx940_acd : VOP3P_Real, + VOP3Pe_MAI , + MFMA_F8F6F4_WithSizeTable_Helper; + + def _gfx940_vcd : VOP3P_Real, + VOP3Pe_MAI , + MFMA_F8F6F4_WithSizeTable_Helper; + } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940" +} + multiclass VOP3P_Real_MFMA_gfx950 op, string Name = !cast(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { @@ -1686,6 +1910,55 @@ multiclass VOP3P_Real_MFMA_gfx950 op, string Name = !cast(N } +multiclass VOP3P_Real_MFMA_F8F6F4_gfx950_mc op, string Name> { + defm _f8_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940; + + let isAsmParserOnly = true in { // Disable ambiguous disassembly. + defm _f8_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f6_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f8_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f4_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f6_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f6_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f4_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940; + defm _f4_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940; + } +} + +multiclass VOP3PX_Real_ScaledMFMA op> { + defvar PS_ACD = !cast(NAME # "_e64"); + defvar PS_VCD = !cast(NAME # "_vgprcd" # "_e64"); + defvar Name = PS_ACD.Mnemonic; + defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8"; + + let SubtargetPredicate = HasGFX950Insts, + DecoderNamespace = "GFX940", + AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { + def _gfx940_acd : VOP3P_Real, + VOP3PXe , + MFMA_F8F6F4_WithSizeTable_Helper; + + def _gfx940_vcd : VOP3P_Real, + VOP3PXe , + MFMA_F8F6F4_WithSizeTable_Helper; + } +} + +multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc op> { + defm _f8_f8 : VOP3PX_Real_ScaledMFMA; + + let isAsmParserOnly = 1 in { // Disable ambiguous disassembly. 
+ defm _f8_f6 : VOP3PX_Real_ScaledMFMA; + defm _f6_f8 : VOP3PX_Real_ScaledMFMA; + defm _f8_f4 : VOP3PX_Real_ScaledMFMA; + defm _f4_f8 : VOP3PX_Real_ScaledMFMA; + defm _f6_f6 : VOP3PX_Real_ScaledMFMA; + defm _f6_f4 : VOP3PX_Real_ScaledMFMA; + defm _f4_f6 : VOP3PX_Real_ScaledMFMA; + defm _f4_f4 : VOP3PX_Real_ScaledMFMA; + } +} + multiclass VOP3P_Real_MFMA_vi op> { def _vi : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_MAI (NAME#"_e64").Pfl, ?> { @@ -1797,6 +2070,10 @@ defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">; defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>; +defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mfma_f32_16x16x128_f8f6f4">; +defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>; +defm V_MFMA_F32_32X32X64_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">; +defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>; defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index a6e6adac04e5a..aa9758219db91 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -448,7 +448,7 @@ class VOP3Pe op, VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } -class VOP3Pe_MAI op, VOPProfile P, bit acc_cd = 0> : Enc64 { +class VOP3Pe_MAI_Base { bits<8> vdst; bits<10> src0; bits<10> src1; @@ -456,11 +456,13 @@ class VOP3Pe_MAI op, VOPProfile P, bit acc_cd = 0> : Enc64 { bits<3> blgp; bits<3> cbsz; bits<4> abid; +} +class VOP3Pe_MAI op, VOPProfile P, bit acc_cd = 0> : Enc64, VOP3Pe_MAI_Base { let Inst{7-0} = vdst; let Inst{10-8} = 
!if(P.HasSrc1, cbsz, 0); - let Inst{14-11} = !if(P.HasSrc1, abid, 0); + let Inst{14-11} = !if(P.HasAbid, abid, 0); let Inst{15} = acc_cd; @@ -506,6 +508,59 @@ class VOP3Pe_SMFMAC op> : Enc64 { let Inst{63-61} = blgp; } +class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_MAI_Base { + bits<9> scale_src0; + bits<9> scale_src1; + + bits<2> scale_src0_opsel; + bits<2> scale_src1_opsel; + + // Inst{7-0} = unused + // Inst{10-8} = neg_hi; + // Inst{13-11} = op_sel + let Inst{11} = scale_src0_opsel{0}; + let Inst{12} = scale_src1_opsel{0}; + // Inst{13} = unused op_sel + // Inst{14} = unused op_sel_hi2 + + let Inst{31-16} = 0b1101001110101100; + let Inst{40-32} = scale_src0; + let Inst{49-41} = scale_src1; + // Inst{50-58} = unused + // Inst{60-59} = op_sel_hi; + let Inst{59} = scale_src0_opsel{1}; + let Inst{60} = scale_src1_opsel{1}; + // Inst{63-61} = neg; + + // The high half of the encoding is the unscaled mfma op. + // + // FIXME: Defining the encoding in terms of the base instruction + // seems to not work, results in all 0 encoding, so replicate all + // the fields from VOP3Pe_MAI, shifted up by 64 + // + // defvar Hi = VOP3Pe_MAI; + // let Inst{127-64} = Hi.Inst; + + let Inst{71-64} = vdst; + let Inst{74-72} = !if(MFMAPfl.HasSrc1, cbsz, 0); + + // abid must be 1 to use a scale. 
+ let Inst{78-75} = 0b0001; // abid + + let Inst{79} = acc_cd; + + let Inst{86-80} = op; + let Inst{95-87} = 0x1a7; //encoding + let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, 0); + let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, 0); + let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, 0); + + let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, 0); // acc(0) + let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, 0); // acc(1) + + let Inst{127-125} = !if(MFMAPfl.HasSrc1, blgp, 0); +} + class VOP3Pe_gfx10 op, VOPProfile P> : VOP3Pe { let Inst{31-23} = 0x198; //encoding } @@ -1343,15 +1398,39 @@ class getVOP3ClampPat { } class getVOP3MAIPat { - list ret = !if(!eq(P.Src0VT, P.Src1VT), + list mfma_with_abid = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$abid, timm:$blgp))]; + list mfma_no_abid = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$blgp))]; + + list ret = !if(!not(P.IsSMFMAC), // mfma - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - timm:$cbsz, timm:$abid, timm:$blgp))], + !if(P.HasAbid, mfma_with_abid, mfma_no_abid), + // smfmac [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, timm:$cbsz, timm:$abid))]); } +class getVOP3MAIScaledPat { + list ret = !if(!not(P.IsSMFMAC), + // mfma + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$blgp, + MFMALdScaleModifierOp:$scale_src0_opsel, + i32:$scale_src0, + MFMALdScaleModifierOp:$scale_src1_opsel, + i32:$scale_src1 + ))], + // smfmac + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, + timm:$cbsz, timm:$abid, + MFMALdScaleModifierOp:$scale_src0_opsel, + i32:$scale_src0, + MFMALdScaleModifierOp:$scale_src1_opsel, + i32:$scale_src1))]); +} + class VOP3Features { bit HasClamp = Clamp; bit HasOpSel = OpSel; @@ -1415,7 +1494,7 @@ multiclass VOP3Inst_Pseudo_Wrapper patter def _e64 : 
VOP3_Pseudo; } -class VOP3InstBase : +class VOP3InstBase : VOP3_Pseudo.ret, !if (P.IsMAI, - getVOP3MAIPat.ret, + !if(MAIScaled, getVOP3MAIScaledPat.ret, getVOP3MAIPat.ret), getVOP3Pat.ret))))), 0, P.HasOpSel> { diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 00a3aaf77f900..9769d8b1d910d 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -285,6 +285,26 @@ define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloa ret void } +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, + i32 immarg, i32, i32 immarg, i32) + +; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +define amdgpu_kernel void @mfma_scale_f32_16x16x128_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + store <4 x float> %result, ptr addrspace(1) %out + ret void +} + +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, + i32 immarg, i32, i32 immarg, i32) + +; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) +define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, 
i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4) + store <16 x float> %result, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll new file mode 100644 index 0000000000000..9658f8381bff2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -0,0 +1,2296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +; 0 = fp8 +; 1 = bf8 +; 2 = fp6 +; 3 = bf6 +; 4 = fp4 + +; -------------------------------------------------------------------- +; Different format signatures +; -------------------------------------------------------------------- + +; fp8 x fp8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 1, i32 %scale0, i32 1, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 2, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; 
GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, 
v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 2, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp8 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 
x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp8 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + 
i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp8 x bf6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp8 x fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; 
GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf8 x fp8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, 
v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf8 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf8 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x 
float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; 
bf8 x bf6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf8 x fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x fp8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) 
{ +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x bf6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be 
optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + + +; bf6 x fp8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x 
i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf6 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, 
a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf6 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 
v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf6 x fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; 
GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; bf6 x bf6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp6 x 
fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp4 x fp8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp4 x bf8 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp4 x fp6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp4 x bf6 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { 
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; fp4 x fp4 +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +; This should be optimized 
to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; -------------------------------------------------------------------- +; Different input parameter classes +; -------------------------------------------------------------------- + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v20, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, s28 +; SDAG-NEXT: v_mov_b32_e32 v23, v1 +; SDAG-NEXT: v_mov_b32_e32 v22, v0 +; SDAG-NEXT: v_mov_b32_e32 v21, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v8, s24 +; SDAG-NEXT: v_mov_b32_e32 v9, s25 +; SDAG-NEXT: v_mov_b32_e32 v10, s26 +; SDAG-NEXT: v_mov_b32_e32 v11, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s0 +; SDAG-NEXT: v_mov_b32_e32 v13, s1 +; SDAG-NEXT: v_mov_b32_e32 v14, s2 +; SDAG-NEXT: v_mov_b32_e32 v15, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: v_mov_b32_e32 v20, s28 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b32_e32 v22, v0 +; GISEL-NEXT: v_mov_b32_e32 v23, v1 +; GISEL-NEXT: v_mov_b32_e32 v21, s29 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: 
v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: 
v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; 
GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; 
SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v14, s16 +; SDAG-NEXT: v_mov_b32_e32 v15, s17 +; SDAG-NEXT: v_mov_b32_e32 v16, s18 +; SDAG-NEXT: v_mov_b32_e32 v17, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; 
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <4 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; 
SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr 
addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_movk_i32 s6, 0x41 +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: 
v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +; This should be optimized to avoid the scale +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +; This should be optimized to avoid the scale, with non-0 op_sel arguments. 
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 
1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <4 x float> %result +} + +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; -------------------------------------------------------------------- + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 
v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 
+; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; 
GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: 
v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: 
+; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, 
i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 +declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 + +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll new file mode 100644 index 0000000000000..67d887d87dd97 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -0,0 +1,6004 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +; 0 = fp8 +; 1 = bf8 +; 2 = fp6 +; 3 = bf6 +; 4 = fp4 + +; -------------------------------------------------------------------- +; Different format signatures +; -------------------------------------------------------------------- + +; fp8 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: 
v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; 
GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: 
v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; 
GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 1, i32 %scale0, i32 1, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; 
GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 2, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] 
+; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; 
GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 
+; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] 
+; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: 
v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; 
SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, 
v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 2, i32 %scale1) + ret <16 x float> %result +} + +; This should be optimized to avoid the scale +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp8 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: 
v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; 
GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 
a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp8 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; 
GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; 
GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; 
GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: 
+; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> 
%arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 0, 
i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 
v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: 
v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; 
GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; 
GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 
v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf8 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: 
v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; 
GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: s_nop 3 
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: 
v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: 
v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; 
GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 
a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + + +; bf6 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: 
v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: 
v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf6 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: 
v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; 
GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf6 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; 
GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: +; GCN: ; %bb.0: +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; bf6 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x 
float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x 
float> %result +} + +; fp6 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + 
i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x 
float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x fp8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x 
i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x bf8 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: 
s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; 
GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x fp6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; 
GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, 
a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x bf6 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, 
a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v10 +; GCN-NEXT: v_accvgpr_write_b32 a1, v11 +; GCN-NEXT: v_accvgpr_write_b32 a2, v12 +; GCN-NEXT: v_accvgpr_write_b32 a3, v13 +; GCN-NEXT: v_accvgpr_write_b32 a4, v14 +; GCN-NEXT: v_accvgpr_write_b32 a5, v15 +; GCN-NEXT: v_accvgpr_write_b32 a6, v16 +; GCN-NEXT: v_accvgpr_write_b32 a7, v17 +; GCN-NEXT: v_accvgpr_write_b32 a8, v18 +; GCN-NEXT: v_accvgpr_write_b32 a9, v19 +; GCN-NEXT: v_accvgpr_write_b32 a10, v20 +; GCN-NEXT: v_accvgpr_write_b32 a11, v21 +; GCN-NEXT: v_accvgpr_write_b32 a12, v22 +; GCN-NEXT: v_accvgpr_write_b32 a13, v23 +; GCN-NEXT: v_accvgpr_write_b32 a14, v24 +; GCN-NEXT: v_accvgpr_write_b32 a15, v25 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: 
v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; fp4 x fp4 +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: 
v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; 
GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +; -------------------------------------------------------------------- +; Different input parameter classes +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: 
v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: 
v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v31, v13 +; SDAG-NEXT: v_mov_b32_e32 v30, v12 +; SDAG-NEXT: v_mov_b32_e32 v29, v11 +; SDAG-NEXT: v_mov_b32_e32 v28, v10 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; 
SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: 
v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: 
v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: 
v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: 
s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 
x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 
v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; 
SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s23 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s24 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s27 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 
v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v31, v13 +; SDAG-NEXT: v_mov_b32_e32 v30, v12 +; SDAG-NEXT: v_mov_b32_e32 v29, v11 +; SDAG-NEXT: v_mov_b32_e32 v28, v10 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v24, s28 +; SDAG-NEXT: v_mov_b32_e32 v25, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_mov_b32_e32 v34, s2 +; SDAG-NEXT: v_mov_b32_e32 v35, s3 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, 
v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v16, s20 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v17, s21 +; GISEL-NEXT: v_mov_b32_e32 v18, s22 +; GISEL-NEXT: v_mov_b32_e32 v19, s23 +; GISEL-NEXT: v_mov_b32_e32 v20, s24 +; GISEL-NEXT: v_mov_b32_e32 v21, s25 +; GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GISEL-NEXT: v_mov_b32_e32 v23, s27 +; GISEL-NEXT: v_mov_b32_e32 v24, s28 +; GISEL-NEXT: v_mov_b32_e32 v25, 
s29 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, 
i32 33, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + ret <16 x float> %result +} + +define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + 
ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v17 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 
v16, a[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[2:3] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] 
offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: s_movk_i32 s2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: 
v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[0:1] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 
a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, 
s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: 
s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: 
v_mov_b64_e32 v[6:7], 16 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; 
SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 
v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: 
v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 +; SDAG-NEXT: global_store_dwordx4 v[4:5], 
v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: 
v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 
s[8:23], s[4:5], 0x40 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16 +; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; 
GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 
a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, 
v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <16 x float> %result +} + +; -------------------------------------------------------------------- +; Incorrect signature for format cases (IR vector too large) +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; 
SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, 
v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: 
v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; 
GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 
offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, 
off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: 
scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) 
{ +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: 
scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 
0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v14 +; GCN-NEXT: v_accvgpr_write_b32 a1, v15 +; GCN-NEXT: v_accvgpr_write_b32 a2, v16 +; GCN-NEXT: v_accvgpr_write_b32 a3, v17 +; GCN-NEXT: v_accvgpr_write_b32 a4, v18 +; GCN-NEXT: v_accvgpr_write_b32 a5, v19 +; GCN-NEXT: v_accvgpr_write_b32 a6, v20 +; GCN-NEXT: v_accvgpr_write_b32 a7, v21 +; GCN-NEXT: v_accvgpr_write_b32 a8, v22 +; GCN-NEXT: v_accvgpr_write_b32 a9, v23 +; GCN-NEXT: v_accvgpr_write_b32 a10, v24 +; GCN-NEXT: v_accvgpr_write_b32 a11, v25 +; GCN-NEXT: v_accvgpr_write_b32 a12, v26 +; GCN-NEXT: v_accvgpr_write_b32 a13, v27 +; GCN-NEXT: v_accvgpr_write_b32 a14, v28 +; GCN-NEXT: v_accvgpr_write_b32 a15, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x 
float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: 
v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: 
v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: scratch_load_dword a15, off, s32 +; GCN-NEXT: v_accvgpr_write_b32 a0, v16 +; GCN-NEXT: v_accvgpr_write_b32 a1, v17 +; GCN-NEXT: v_accvgpr_write_b32 a2, v18 +; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_accvgpr_write_b32 a4, v20 +; GCN-NEXT: v_accvgpr_write_b32 a5, v21 +; GCN-NEXT: v_accvgpr_write_b32 a6, v22 +; GCN-NEXT: v_accvgpr_write_b32 a7, v23 +; GCN-NEXT: v_accvgpr_write_b32 a8, v24 +; GCN-NEXT: v_accvgpr_write_b32 a9, v25 +; GCN-NEXT: v_accvgpr_write_b32 a10, v26 +; GCN-NEXT: v_accvgpr_write_b32 a11, v27 +; GCN-NEXT: v_accvgpr_write_b32 a12, v28 +; GCN-NEXT: v_accvgpr_write_b32 a13, v29 +; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 +declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 + 
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } +attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index a692693638c69..07d930b0b64df 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -2,6 +2,13 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=ERR %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefix=ERR %s +// cbsz = SrcA +// blgp = SrcB + +// 0 = fp8. 1 = bf8 +// 2 = fp6, 3 = bf6 +// 4 = fp4 + //===----------------------------------------------------------------------===// // MFMA opcodes. //===----------------------------------------------------------------------===// @@ -275,3 +282,612 @@ v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] + +//===----------------------------------------------------------------------===// +// v_mfma_f32_16x16x128_f8f6f4 +//===----------------------------------------------------------------------===// + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04] +// ERR: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: 
instruction not supported on this GPU +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3 + +//===----------------------------------------------------------------------===// +// v_mfma_f32_32x32x64_f8f6f4 +//===----------------------------------------------------------------------===// + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] + +// 
GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1 ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 + +//===----------------------------------------------------------------------===// +// v_mfma_scale_f32_16x16x128_f8f6f4 +//===----------------------------------------------------------------------===// +// FIXME: Test op_sel, neg, clamp + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction 
not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], 
v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 
op_sel:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 + +//===----------------------------------------------------------------------===// +// v_mfma_scale_f32_32x32x64_f8f6f4 +//===----------------------------------------------------------------------===// +// FIXME: Test op_sel, neg, clamp + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], 
v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c] +// ERR: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:2 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 
v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 blgp:2 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 + +//===----------------------------------------------------------------------===// +// v_mfma_f32_16x16x128_f8f6f4 with appropriate register widths +//===----------------------------------------------------------------------===// + +// bf8 x fp4 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4 + + +// fp4 x bf8 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1 + + +// bf6 x bf8 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1 ; encoding: 
[0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1 + + +// bf8 x bf6 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3 + + +// bf6 x bf6 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3 + +// bf6 x fp6 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2 + + +// fp6 x bf6 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3 + +// GFX950: 
v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3 + + +// fp6 x fp4 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4 + + +// fp4 x fp6 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2 + + +// fp4 x fp4 +// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4 + +// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4 + +//===----------------------------------------------------------------------===// +// v_mfma_f32_32x32x64_f8f6f4 with appropriate register widths +//===----------------------------------------------------------------------===// + +// bf8 x fp4 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84] 
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4 + + +// fp4 x bf8 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1 + + +// bf6 x bf8 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1 + +// bf8 x bf6 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3 + + +// bf6 x bf6 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:3 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], 
a[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:3 + + +// bf6 x fp6 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2 + + +// fp6 x bf6 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3 + + +// fp6 x fp4 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4 + + +// fp4 x fp6 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 
blgp:2 + + +// fp4 x fp4 +// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4 + +// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c] +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4 + +//===----------------------------------------------------------------------===// +// v_mfma_scale_f32_16x16x128_f8f6f4 corrected widths +//===----------------------------------------------------------------------===// + +// fp8 x fp8 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:0 blgp:0 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:0 blgp:0 + + +// bf8 x fp8 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:1 blgp:0 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:1 blgp:0 + +// fp8 x bf8 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 
v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:0 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:0 blgp:1 + + +// bf8 x fp4 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 cbsz:1 blgp:4 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 cbsz:1 blgp:4 + +// fp4 x bf8 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 cbsz:4 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 cbsz:4 blgp:1 + + +// bf6 x bf8 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; 
encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 cbsz:3 blgp:1 + + +// bf8 x bf6 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 cbsz:1 blgp:3 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 cbsz:1 blgp:3 + + +// bf6 x bf6 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:3 blgp:3 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:3 blgp:3 + + +// bf6 x fp6 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: 
[0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:3 blgp:2 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:3 blgp:2 + + +// fp6 x bf6 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:2 blgp:3 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:2 blgp:3 + + +// fp6 x fp4 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 cbsz:2 blgp:4 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 cbsz:2 blgp:4 + +// fp4 x fp6 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding:
[0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 cbsz:4 blgp:2 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 cbsz:4 blgp:2 + +// fp4 x fp4 +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 cbsz:4 blgp:4 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c] +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 cbsz:4 blgp:4 + + +//===----------------------------------------------------------------------===// +// v_mfma_scale_f32_32x32x64_f8f6f4 corrected widths +//===----------------------------------------------------------------------===// + +// fp8 x fp8 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:0 blgp:0 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:0 blgp:0 + +// bf8 x fp8 +// GFX950: 
v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:1 blgp:0 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:1 blgp:0 + + +// fp8 x bf8 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:0 blgp:1 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:0 blgp:1 + +// bf8 x fp4 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 cbsz:1 blgp:4 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 cbsz:1 blgp:4 + + +// fp4 x bf8 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, 
v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, v49 cbsz:4 blgp:1 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 cbsz:4 blgp:1 + + +// bf6 x bf8 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 cbsz:3 blgp:1 + + +// bf8 x bf6 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 cbsz:1 blgp:3 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 cbsz:1 blgp:3 + +// bf6 x bf6 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: 
[0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:3 blgp:3 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:3 blgp:3 + +// bf6 x fp6 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:3 blgp:2 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:3 blgp:2 + + +// fp6 x bf6 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:2 blgp:3 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:2 blgp:3 + + +// fp6 x fp4 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: 
[0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 cbsz:2 blgp:4 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 cbsz:2 blgp:4 + + +// fp4 x fp6 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 cbsz:4 blgp:2 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 cbsz:4 blgp:2 + + +// fp4 x fp4 +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84] +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 cbsz:4 blgp:4 + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c] +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 cbsz:4 blgp:4 + diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 1fa48fca80fb4..b35789cbf500f 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -159,3 +159,411 @@ # GFX950: v_mfma_ld_scale_b32 vcc_lo, vcc_lo ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18] 0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18 + +# GFX950: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +0x00,0x00,0x80,0xbf + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c] +0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c] +0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c] +0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c] +0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c] +0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: 
v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c] +0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c] +0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c] +0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84] +0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04] +0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24] +0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84] +0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44] +0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44 + +# 
GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84] +0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44] +0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04] +0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24] +0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c] +0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c] +0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c] 
+0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1 ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c] +0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c] +0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c] +0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c] +0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c] +0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c] +0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c] +0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c] +0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c] +0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c] +0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3 ; encoding: 
[0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c] +0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84] +0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64] +0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04] +0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24] +0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84] +0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44] +0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24] +0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84] +0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64] +0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44] +0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 
blgp:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64] +0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24] +0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04] +0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04 + +# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24] +0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: 
[0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c + +# GFX950: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84] 
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: 
[0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 
op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] 
cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c + +# GFX950: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84] 
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; 
encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04 + +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], 
v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt new file mode 100644 index 0000000000000..3227b619de4fd --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt @@ -0,0 +1,28 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s + +# Check behavior of truncated instruction just to the ld_scale + +# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00] +0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00 + +# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00] +0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00 + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04] +0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04 + +# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00] +0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00 + +# GFX950: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +0x00,0x00,0x80,0xbf + +# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04] +0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04 + + +# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00] +0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00 + +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x88,0xad,0xd3,0x00,0x11,0x02,0x04] 
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x88,0xad,0xd3,0x00,0x11,0x02,0x04 diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s index e601de8d706b4..3ab3319c21ecd 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s @@ -1,11 +1,13 @@ # RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s # CHECK: Iterations: 1 -# CHECK: Instructions: 7 -# CHECK: Total Cycles: 42 -# CHECK: Total uOps: 7 +# CHECK: Instructions: 9 +# CHECK: Total Cycles: 44 +# CHECK: Total uOps: 9 v_mfma_ld_scale_b32 v0, v0 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] @@ -14,9 +16,10 @@ v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 - # CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions: # CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0 +# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] +# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]