Skip to content

AMDGPU: Add first gfx950 mfma instructions #116312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-conversion-
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-conversion-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-insts")

//===----------------------------------------------------------------------===//
// GFX950 only builtins.
//===----------------------------------------------------------------------===//
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
Expand Down
25 changes: 23 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950

#pragma OPENCL EXTENSION cl_khr_fp64:enable

Expand Down Expand Up @@ -222,7 +223,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)

#endif // MFMA_GFX90A_TESTS

#ifdef MFMA_GFX940_TESTS
#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
Expand Down Expand Up @@ -404,4 +405,24 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, in
{
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
}
#endif // MFMA_GFX940_TESTS
#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)

#ifdef MFMA_GFX950_TESTS

// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)

v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
{
return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
{
return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}


#endif
21 changes: 21 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx950 -verify -S -o - %s

typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));


void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {

*out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
}


void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16 c, int X) {
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
}
12 changes: 12 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s

typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));

void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
__global float16* out1, half8 a1, half8 b1, float16 c1) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
}
9 changes: 9 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3110,6 +3110,15 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;

//===----------------------------------------------------------------------===//
// gfx950 intrinsics
//===----------------------------------------------------------------------===//

defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
}

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1983,6 +1983,10 @@ def isNotGFX940Plus :
Predicate<"!Subtarget->hasGFX940Insts()">,
AssemblerPredicate<(all_of (not FeatureGFX940Insts))>;

def HasGFX950Insts :
Predicate<"Subtarget->hasGFX950Insts()">,
AssemblerPredicate<(all_of FeatureGFX950Insts)>;

def isGFX8GFX9NotGFX940 :
Predicate<"!Subtarget->hasGFX940Insts() &&"
"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4747,7 +4747,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,8 @@ foreach intr = AMDGPUMFMAIntrinsics90A in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUMFMAIntrinsics940 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUMFMAIntrinsics950 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// hasGFX90AInsts is also true.
bool hasGFX940Insts() const { return GFX940Insts; }

// GFX950 is a derivation to GFX940. hasGFX950Insts() implies that
// hasGFX940Insts and hasGFX90AInsts are also true.
bool hasGFX950Insts() const { return GFX950Insts; }

bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2845,6 +2845,10 @@ def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>;
def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;

def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;


class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,11 @@ def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_
def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;

def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16 : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;

class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
string FMAOp = Name;
Expand Down Expand Up @@ -739,6 +744,11 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",

} // End SubtargetPredicate = HasMAIInsts

let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
}

let Predicates = [isGFX90APlus] in {
let is_gfx940_xdl = 1 in {
defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
Expand Down Expand Up @@ -1650,6 +1660,16 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
}
}

multiclass VOP3P_Real_MFMA_gfx950<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
let SubtargetPredicate = HasGFX950Insts,
AssemblerPredicate = HasGFX950Insts in {
defm "" : VOP3P_Real_MFMA_gfx940<op, Name, PS_ACD, PS_VCD>;
}
}


multiclass VOP3P_Real_MFMA_vi<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
Expand Down Expand Up @@ -1764,6 +1784,8 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;

defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
let SubtargetPredicate = HasXF32Insts in {
Expand Down
17 changes: 17 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,23 @@ bb:
ret void
}

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
define amdgpu_kernel void @mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
}

; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
store <16 x float> %result, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
Expand Down
Loading
Loading