diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb74..f48374c179c4d 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -824,4 +824,46 @@ let SMETargetGuard = "sme-lutv2" in {
   def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
+let SMETargetGuard = "sme-f8f32" in {
+  def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
+                             [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
+
+  // FMLALL (indexed)
+  def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLA_FP8_LANE_ZA32_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+  // FMLALL (multiple and single)
+  def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
+                                         [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
+                                         [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
+                                         [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+}
+
+let SMETargetGuard = "sme-f8f16" in {
+  def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
+                             [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
+
+  // FMLAL (indexed)
+  def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
+                                       [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+  // FMLAL (multiple and single)
+  def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
+                                         [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
+                                         [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
"aarch64_sme_fp8_fmlal_single_za16_vg2x4", + [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; +} + } // let SVETargetGuard = InvalidMode diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index cb9c23b8e0a0d..56595bb4704e7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) { case SVETypeFlags::EltTyInt64: return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2); + case SVETypeFlags::EltTyMFloat8: + return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16); case SVETypeFlags::EltTyFloat16: return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8); case SVETypeFlags::EltTyBFloat16: @@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, BuiltinID == SME::BI__builtin_sme_svstr_za) return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); + // Emit set FPMR for intrinsics that require it + if (TypeFlags.setsFPMR()) + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), + Ops.pop_back_val()); // Handle builtins which require their multi-vector operands to be swapped swapCommutativeSMEOperands(BuiltinID, Ops); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c new file mode 100644 index 0000000000000..95d6383ab30ef --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c @@ -0,0 +1,55 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + + +// CHECK-LABEL: define dso_local void @test_svmopa_za16_mf8_m( +// CHECK-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, [[PN]], [[PM]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void 
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za16_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+                            svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svmopa_za16,_mf8,_m_fpm)(1, pn, pm, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za32_mf8_m(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svmopa_za32_mf8_mu10__SVBool_tS_u13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+                            svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svmopa_za32,_mf8,_m_fpm)(3, pn, pm, zn, zm, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
new file mode 100644
index 0000000000000..85870f5e4c353
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// FMLAL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
+}
+
+// FMLALL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_lane_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm,,)(slice, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmla_single_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, fpm);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c
new file mode 100644
index 0000000000000..c60b179a43e15
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void test_svmopa(svbool_t pn, svbool_t pm, svmfloat8_t zn, svmfloat8_t zm,
+                 fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+  svmopa_za16_mf8_m_fpm(-1, pn, pm, zn, zm, fpmr);
+  // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}}
+  svmopa_za16_mf8_m_fpm(2, pn, pm, zn, zm, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+  svmopa_za32_mf8_m_fpm(-1, pn, pm, zn, zm, fpmr);
+  // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}}
+  svmopa_za32_mf8_m_fpm(4, pn, pm, zn, zm, fpmr);
+}
+
+void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_t znx4,
+                fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x1_fpm(slice, zn, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x1_fpm(slice, zn, zn, 16, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x2_fpm(slice, znx2, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x2_fpm(slice, znx2, zn, 16, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x4_fpm(slice, znx4, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za16_mf8_vg2x4_fpm(slice, znx4, zn, 16, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x1_fpm(slice, zn, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x1_fpm(slice, zn, zn, 16, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x2_fpm(slice, znx2, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x2_fpm(slice, znx2, zn, 16, fpmr);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, -1, fpmr);
+  // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
+  svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, 16, fpmr);
+
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mopa.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mopa.c
new file mode 100644
index 0000000000000..86426abcd4329
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mopa.c
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -verify -emit-llvm-only %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void test_features(svbool_t pn, svbool_t pm, svmfloat8_t zn, svmfloat8_t zm,
+                   fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{'svmopa_za16_mf8_m_fpm' needs target feature sme,sme-f8f16}}
+  svmopa_za16_mf8_m_fpm(0, pn, pm, zn, zm, fpmr);
+  // expected-error@+1 {{'svmopa_za32_mf8_m_fpm' needs target feature sme,sme-f8f32}}
+  svmopa_za32_mf8_m_fpm(0, pn, pm, zn, zm, fpmr);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme_fp8_mla.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme_fp8_mla.c
new file mode 100644
index 0000000000000..9204ca0ae5d4c
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme_fp8_mla.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -verify -emit-llvm-only %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_t znx4,
+                fpm_t fpmr) __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{'svmla_lane_za16_mf8_vg2x1_fpm' needs target feature sme,sme-f8f16}}
+  svmla_lane_za16_mf8_vg2x1_fpm(slice, zn, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_lane_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
+  svmla_lane_za16_mf8_vg2x2_fpm(slice, znx2, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_lane_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
+  svmla_lane_za16_mf8_vg2x4_fpm(slice, znx4, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_lane_za32_mf8_vg4x1_fpm' needs target feature sme,sme-f8f32}}
+  svmla_lane_za32_mf8_vg4x1_fpm(slice, zn, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_lane_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
+  svmla_lane_za32_mf8_vg4x2_fpm(slice, znx2, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_lane_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
+  svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, 0, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za16_mf8_vg2x1_fpm' needs target feature sme,sme-f8f16}}
+  svmla_single_za16_mf8_vg2x1_fpm(slice, zn, zn, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
+  svmla_single_za16_mf8_vg2x2_fpm(slice, znx2, zn, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
+  svmla_single_za16_mf8_vg2x4_fpm(slice, znx4, zn, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za32_mf8_vg4x1_fpm' needs target feature sme,sme-f8f32}}
+  svmla_single_za32_mf8_vg4x1_fpm(slice, zn, zn, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
+  svmla_single_za32_mf8_vg4x2_fpm(slice, znx2, zn, fpmr);
+
+  // expected-error@+1 {{'svmla_single_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
+  svmla_single_za32_mf8_vg4x4_fpm(slice, znx4, zn, fpmr);
+}
\ No newline at end of file
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a91616b955682..af268e6f08489 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2983,6 +2983,13 @@ let TargetPrefix = "aarch64" in {
"aarch64" in { LLVMMatchType<0>, llvm_anyvector_ty], [ImmArg>]>; + class SME_FP8_OuterProduct_Intrinsic + : DefaultAttrsIntrinsic<[], + [llvm_i32_ty, + llvm_nxv16i1_ty, llvm_nxv16i1_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [ImmArg>, IntrInaccessibleMemOnly, IntrHasSideEffects]>; + def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic; def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic; @@ -2998,6 +3005,10 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic; def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic; + // FP8 outer product + def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic; + def int_aarch64_sme_fp8_fmopa_za32 : SME_FP8_OuterProduct_Intrinsic; + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, @@ -3864,3 +3875,66 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic; // Neon absolute maximum and minimum def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic; + +// +// FP8 intrinsics + +let TargetPrefix = "aarch64" in { + +class SME_FP8_ZA_LANE_VGx1 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg>]>; + +class SME_FP8_ZA_LANE_VGx2 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg>]>; + +class SME_FP8_ZA_LANE_VGx4 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg>]>; + +class SME_FP8_ZA_SINGLE_VGx1 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + +class SME_FP8_ZA_SINGLE_VGx2 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + +class SME_FP8_ZA_SINGLE_VGx4 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + + // Double-vector groups (F8F16) + def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x1 : SME_FP8_ZA_LANE_VGx1; + def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x2 : SME_FP8_ZA_LANE_VGx2; + def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x4 : SME_FP8_ZA_LANE_VGx4; + + def int_aarch64_sme_fp8_fmlal_single_za16_vg2x1 : SME_FP8_ZA_SINGLE_VGx1; + def int_aarch64_sme_fp8_fmlal_single_za16_vg2x2 : SME_FP8_ZA_SINGLE_VGx2; + def int_aarch64_sme_fp8_fmlal_single_za16_vg2x4 : SME_FP8_ZA_SINGLE_VGx4; + + // Quad-vector groups (F8F32) + def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x1 : SME_FP8_ZA_LANE_VGx1; + def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x2 : SME_FP8_ZA_LANE_VGx2; + def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x4 : SME_FP8_ZA_LANE_VGx4; + + def int_aarch64_sme_fp8_fmlall_single_za32_vg4x1 : SME_FP8_ZA_SINGLE_VGx1; + def int_aarch64_sme_fp8_fmlall_single_za32_vg4x2 : SME_FP8_ZA_SINGLE_VGx2; + def int_aarch64_sme_fp8_fmlall_single_za32_vg4x4 : SME_FP8_ZA_SINGLE_VGx4; +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 37ac915d1d880..781faed53555c 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ 
@@ -990,50 +990,49 @@ defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11
 defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
 defm FDOT_VG2_M2ZZ_BtoH  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
 defm FDOT_VG4_M4ZZ_BtoH  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
-// TODO: Replace nxv16i8 by nxv16f8
+
 defm FDOT_VG2_M2Z2Z_BtoH : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
 defm FDOT_VG4_M4Z4Z_BtoH : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
 
-def FMLAL_MZZI_BtoH       : sme2_mla_ll_array_index_16b<"fmlal", 0b11, 0b00>;
-defm FMLAL_VG2_M2ZZI_BtoH : sme2_multi_vec_array_vg2_index_16b<"fmlal", 0b10, 0b111>;
-defm FMLAL_VG4_M4ZZI_BtoH : sme2_multi_vec_array_vg4_index_16b<"fmlal", 0b10, 0b110>;
-def FMLAL_VG2_MZZ_BtoH    : sme2_mla_long_array_single_16b<"fmlal">;
-// TODO: Replace nxv16i8 by nxv16f8
-defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8, null_frag>;
-defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, MatrixOp16, ZZZZ_b, ZPR4b8, nxv16i8, null_frag>;
+defm FMLAL_MZZI_BtoH      : sme2_fp8_fmlal_indexed_za16<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x1>;
+defm FMLAL_VG2_M2ZZI_BtoH : sme2_fp8_fmlal_indexed_za16_vgx2<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x2>;
+defm FMLAL_VG4_M4ZZI_BtoH : sme2_fp8_fmlal_indexed_za16_vgx4<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x4>;
+
+defm FMLAL_VG2_MZZ_BtoH  : sme2_fp8_fmlal_single_za16<"fmlal", int_aarch64_sme_fp8_fmlal_single_za16_vg2x1>;
+defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp8_fmlal_single_za16_vgx2<"fmlal", int_aarch64_sme_fp8_fmlal_single_za16_vg2x2>;
+defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp8_fmlal_single_za16_vgx4<"fmlal", int_aarch64_sme_fp8_fmlal_single_za16_vg2x4>;
+
 defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
 defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
 
-defm FMOPA_MPPZZ_BtoH : sme2p1_fmop_tile_f8f16<"fmopa", 0b1, 0b0, 0b01>;
-
+defm FMOPA_MPPZZ_BtoH : sme2_fp8_fmopa_za16<"fmopa", int_aarch64_sme_fp8_fmopa_za16>;
 } //[HasSMEF8F16]
 
 let Predicates = [HasSMEF8F32] in {
-// TODO : Replace nxv16i8 by nxv16f8
+
 defm FDOT_VG2_M2ZZI_BtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b01, 0b0111, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
 defm FDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b0001, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
 defm FDOT_VG2_M2ZZ_BtoS  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>;
 defm FDOT_VG4_M4ZZ_BtoS  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>;
-// TODO : Replace nxv16i8 by nxv16f8
+
 defm FDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
 defm FDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
 
 def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>;
 def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>;
 
-defm FMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"fmlall", 0b01, 0b000, null_frag>;
-defm FMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"fmlall", 0b10, 0b100, null_frag>;
-defm FMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"fmlall", 0b00, 0b1000, null_frag>;
-// TODO: Replace nxv16i8 by nxv16f8
-defm FMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"fmlall", 0b01000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, null_frag>;
-defm FMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg24_single<"fmlall", 0b000001, MatrixOp32, ZZ_b, ZPR4b8>;
-defm FMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg24_single<"fmlall", 0b010001, MatrixOp32, ZZZZ_b, ZPR4b8>;
-defm FMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"fmlall", 0b01000, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
-defm FMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"fmlall", 0b01000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+defm FMLALL_MZZI_BtoS      : sme2_fp8_fmlall_indexed_za32<"fmlall", int_aarch64_sme_fp8_fmlall_lane_za32_vg4x1>;
+defm FMLALL_VG2_M2ZZI_BtoS : sme2_fp8_fmlall_indexed_za32_vgx2<"fmlall", int_aarch64_sme_fp8_fmlall_lane_za32_vg4x2>;
+defm FMLALL_VG4_M4ZZI_BtoS : sme2_fp8_fmlall_indexed_za32_vgx4<"fmlall", int_aarch64_sme_fp8_fmlall_lane_za32_vg4x4>;
+defm FMLALL_MZZ_BtoS       : sme2_fp8_fmlall_single_za32<"fmlall", int_aarch64_sme_fp8_fmlall_single_za32_vg4x1>;
+defm FMLALL_VG2_M2ZZ_BtoS  : sme2_fp8_fmlall_single_za32_vgx2<"fmlall", int_aarch64_sme_fp8_fmlall_single_za32_vg4x2>;
+defm FMLALL_VG4_M4ZZ_BtoS  : sme2_fp8_fmlall_single_za32_vgx4<"fmlall", int_aarch64_sme_fp8_fmlall_single_za32_vg4x4>;
 
-defm FMOPA_MPPZZ_BtoS : sme_outer_product_fp32<0b0, 0b01, ZPR8, "fmopa", null_frag>;
+defm FMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"fmlall", 0b01000, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm FMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"fmlall", 0b01000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+
+defm FMOPA_MPPZZ_BtoS : sme2_fp8_fmopa_za32<"fmopa", int_aarch64_sme_fp8_fmopa_za32>;
 } //[HasSMEF8F32]
 
 let Predicates = [HasSME2, HasSVEBFSCALE] in {
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 776472e72af05..331653229f12c 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -305,6 +305,21 @@ multiclass sme_outer_product_fp32<bit S, bits<2> sz, ZPRRegOp zpr_ty, string mne
   def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_3, nxv4i1, nxv4f32>;
 }
 
+multiclass sme2_fp8_fmopa_za32<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme_fp_outer_product_inst<0, 0b01, 0b00, TileOp32, ZPR8, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<2> ZAda;
+    let Inst{1-0} = ZAda;
+    let Inst{2}   = 0b0;
+
+    let Uses = [FPMR, FPCR];
+  }
+
+  let mayStore = 1, mayLoad = 1 in
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
+
+  def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_3, nxv16i1, nxv16i8>;
+}
+
 multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> {
   def NAME : sme_fp_outer_product_inst<S, 0b10, 0b00, TileOp64, ZPR64, mnemonic>, SMEPseudo2Instr<NAME, 1> {
     bits<3> ZAda;
@@ -316,12 +331,19 @@ multiclass sme_outer_product_fp64
   def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_7, nxv2i1, nxv2f64>;
 }
 
-multiclass sme2p1_fmop_tile_f8f16<string mnemonic, bit bf, bit s, bits<2> op> {
-  def NAME : sme_fp_outer_product_inst<s, {0, bf}, op, TileOp16, ZPR8, mnemonic> {
+multiclass sme2_fp8_fmopa_za16<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme_fp_outer_product_inst<0, {0, 0b1}, 0b01, TileOp16, ZPR8, mnemonic>, SMEPseudo2Instr<NAME, 1> {
     bits<1> ZAda;
     let Inst{2-1} = 0b00;
    let Inst{0}   = ZAda;
+
+    let Uses = [FPMR, FPCR];
   }
+
+  let mayStore = 1, mayLoad = 1 in
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8, SMEMatrixTileH>, SMEPseudo2Instr<NAME, 0>;
+
+  def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_1, nxv16i1, nxv16i8>;
 }
 
 multiclass sme2p1_fmop_tile_fp16 {
@@ -2112,6 +2134,15 @@ class sme2_mla_long_array_single_16b
   let Inst{2-0} = imm;
 }
 
+multiclass sme2_fp8_fmlal_single_za16<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME : sme2_mla_long_array_single_16b<mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_Multi_Single_Pat;
+}
+
 class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op, bit o2,
                                       MatrixOperand matrix_ty, RegisterOperand multi_vector_ty,
                                       ZPRRegOp zpr_ty, string mnemonic, string vg_acronym>
@@ -2127,6 +2158,34 @@ class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op, bit o2,
   let Inst{1-0} = imm;
 }
 
+multiclass sme2_fp8_fmlal_single_za16_vgx2<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME : sme2_mla_long_array_vg24_single<0b00, 0b0, 0b00, 0b1, MatrixOp16, ZZ_b, ZPR4b8,
+                                             mnemonic, "vgx2">, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+                  uimm2s2range:$imm, ZZ_b:$Zn, ZPR4b8:$Zm), 0>;
+}
+
+multiclass sme2_fp8_fmlal_single_za16_vgx4<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, 0b00, 0b1, MatrixOp16, ZZZZ_b, ZPR4b8,
+                                             mnemonic, "vgx4">, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+                  uimm2s2range:$imm, ZZZZ_b:$Zn, ZPR4b8:$Zm), 0>;
+}
+
@@ -2863,7 +2922,7 @@ class sme2_multi_vec_array_vg24_index_16b<bits<2> sz, bit vg4, bits<3> op,
                                           RegisterOperand multi_vector_ty, string mnemonic>
   : I<(outs MatrixOp16:$ZAda), (ins MatrixOp16:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2,
-                                multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexB:$i),
+                                multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i),
       mnemonic, "\t$ZAda[$Rv, $imm2, " # !if(vg4, "vgx4", "vgx2") # "], $Zn, $Zm$i",
       "", []>, Sched<[]> {
   bits<4> Zm;
@@ -2882,33 +2941,44 @@ class sme2_multi_vec_array_vg24_index_16b<bits<2> sz, bit vg4, bits<3> op,
   let Inst{3-2} = i{1-0};
   let Inst{1-0} = imm2;
 
+  let Uses = [FPMR, FPCR];
   let Constraints = "$ZAda = $_ZAda";
 }
 
-multiclass sme2_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op> {
-  def NAME : sme2_multi_vec_array_vg24_index_16b<sz, 0b0, op, ZZ_b_mul_r, mnemonic> {
+multiclass sme2_fp8_fmlal_indexed_za16_vgx2<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg24_index_16b<0b10, 0b0, 0b111, ZZ_b_mul_r, mnemonic>, SMEPseudo2Instr<NAME, 1> {
     bits<4> Zn;
     let Inst{9-6} = Zn;
-  }
-  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i",
-                  (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
-                  uimm2s2range:$imm2, ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB:$i), 0>;
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i",
                   (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2,
+                  ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
 }
 
-multiclass sme2_multi_vec_array_vg4_index_16b<string mnemonic, bits<2> sz, bits<3> op> {
-  def NAME: sme2_multi_vec_array_vg24_index_16b<sz, 0b1, op, ZZZZ_b_mul_r, mnemonic> {
+multiclass sme2_fp8_fmlal_indexed_za16_vgx4<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME: sme2_multi_vec_array_vg24_index_16b<0b10, 0b1, 0b110, ZZZZ_b_mul_r, mnemonic>, SMEPseudo2Instr<NAME, 1> {
    bits<3> Zn;
    let Inst{9-7} = Zn;
    let Inst{6}   = 0b0;
  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i",
+                  (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm,
+                  ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
 }
 
 //===----------------------------------------------------------------------===//
-// SME2 multi-vec indexed long long MLA one source 16-bit
-class sme2_mla_ll_array_index_16b<string mnemonic, bits<2> sz,bits<2> op>
+// FMLAL (single and indexed vector, FP8 to FP16)
+class sme2_mla_long_array_index_16b<string mnemonic, bits<2> sz,bits<2> op>
   : I<(outs MatrixOp16:$ZAda),
       (ins MatrixOp16:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm3s2range:$imm3, ZPR8:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i),
       mnemonic, "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
@@ -2931,9 +3001,18 @@ class sme2_mla_ll_array_index_16b<string mnemonic, bits<2> sz,bits<2> op>
   let Inst{3}   = i{0};
   let Inst{2-0} = imm3;
 
+  let Uses = [FPMR, FPCR];
   let Constraints = "$ZAda = $_ZAda";
 }
 
+multiclass sme2_fp8_fmlal_indexed_za16<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_long_array_index_16b<mnemonic, 0b11, 0b00>, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_Multi_Index_Pat;
+}
+
 // SME2 multi-vec indexed long long MLA one source 32-bit
 class sme2_mla_ll_array_index_32b<string mnemonic, bits<2> sz, bits<3> op>
   : I<(outs MatrixOp32:$ZAda),
@@ -2967,6 +3046,16 @@ multiclass sme2_mla_ll_array_index_32b<string mnemonic, bits<2> sz, bits<3> op,
   def : SME2_ZA_TwoOp_Multi_Index_Pat;
 }
 
+multiclass sme2_fp8_fmlall_indexed_za32<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_ll_array_index_32b<mnemonic, 0b01, 0b000>, SMEPseudo2Instr<NAME, 1> {
+    let Uses = [FPMR, FPCR];
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_Multi_Index_Pat;
+}
+
 // SME2 multi-vec indexed long long MLA one source 64-bit
 class sme2_mla_ll_array_index_64b<string mnemonic, bits<2> op>
@@ -3044,6 +3133,22 @@ multiclass sme2_mla_ll_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<3>
                   (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
 }
 
+multiclass sme2_fp8_fmlall_indexed_za32_vgx2<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_ll_array_vg24_index_32b<0b10, 0b0, 0b100, ZZ_b_mul_r, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<4> Zn;
+    let Inst{9-6} = Zn;
+    let Uses = [FPMR, FPCR];
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i",
+                  (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm,
+                  ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
+}
+
 // SME2 multi-vec indexed long long MLA four sources 32-bit
 multiclass sme2_mla_ll_array_vg4_index_32b<string mnemonic, bits<2> sz, bits<4> op, SDPatternOperator intrinsic> {
@@ -3060,6 +3165,24 @@ multiclass sme2_mla_ll_array_vg4_index_32b<string mnemonic, bits<2> sz, bits<4>
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i",
                   (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
 }
+
+multiclass sme2_fp8_fmlall_indexed_za32_vgx4<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_ll_array_vg24_index_32b<0b00, 0b1, 0b000, ZZZZ_b_mul_r, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> Zn;
+    let Inst{9-7} = Zn;
+    let Inst{6}   = 0b1;
+    let Uses = [FPMR, FPCR];
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i",
+                  (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+                  uimm1s4range:$imm, ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), 0>;
+}
+
 class sme2_mla_ll_array_vg24_index_64b<bit vg4, bits<2> op,
                                        RegisterOperand vector_ty, string mnemonic>
@@ -3159,6 +3282,15 @@ multiclass sme2_mla_ll_array_single<string mnemonic, bits<5> op,
   def : SME2_ZA_TwoOp_Multi_Single_Pat;
 }
 
+multiclass sme2_fp8_fmlall_single_za32<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME : sme2_mla_ll_array_single<mnemonic, 0b01000, MatrixOp32, ZPR8, ZPR4b8>, SMEPseudo2Instr<NAME, 1>;
+
+  def NAME # _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_Multi_Single_Pat;
+}
+
 class sme2_mla_ll_array_vg24_single<bits<6> op, MatrixOperand matrix_ty,
                                     RegisterOperand vector_ty, ZPRRegOp zpr_ty, string mnemonic>
@@ -3188,6 +3320,33 @@ class sme2_mla_ll_array_vg24_single<bits<6> op, MatrixOperand matrix_ty,
 
 //SME2 single-multi long long MLA two and four sources
 
+multiclass sme2_fp8_fmlall_single_za32_vgx2<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME: sme2_mla_ll_array_vg24_single<0b000001, MatrixOp32, ZZ_b, ZPR4b8, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def NAME # _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+                  uimm1s4range:$imm, ZZ_b:$Zn, ZPR4b8:$Zm), 0>;
+}
+
+multiclass sme2_fp8_fmlall_single_za32_vgx4<string mnemonic, SDPatternOperator intrinsic> {
+  let Uses = [FPMR, FPCR] in
+  def NAME: sme2_mla_ll_array_vg24_single<0b010001, MatrixOp32, ZZZZ_b, ZPR4b8, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def NAME # _PSEUDO : sme2_za_array_2op_multi_single_pseudo;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+                  uimm1s4range:$imm, ZZZZ_b:$Zn, ZPR4b8:$Zm), 0>;
+}
+
+
 multiclass sme2_mla_ll_array_vg24_single<string mnemonic, bits<6> op,
                                          MatrixOperand matrix_ty,
                                          RegisterOperand multi_vector_ty,
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-fmopa.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-fmopa.ll
new file mode 100644
index 0000000000000..6e88cdf4e7fec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-fmopa.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f8f16,+sme-f8f32 -force-streaming < %s | FileCheck %s
+
+define void @test_fmopa_16(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %vn, <vscale x 16 x i8> %vm) {
+; CHECK-LABEL: test_fmopa_16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmopa za1.h, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm,
+                                             <vscale x 16 x i8> %vn, <vscale x 16 x i8> %vm)
+  ret void
+}
+
+define void @test_fmopa_32(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %vn, <vscale x 16 x i8> %vm) #0 {
+; CHECK-LABEL: test_fmopa_32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmopa za3.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm,
+                                             <vscale x 16 x i8> %vn, <vscale x 16 x i8> %vm)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-mla.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-mla.ll
new file mode 100644
index 0000000000000..ad225badb244c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-mla.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:" --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f8f16,+sme-f8f32 -force-streaming < %s | FileCheck %s
+
+; FMLAL (indexed)
+
+define void @test_fmlal_lane_vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_lane_vg2x1:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1], z0.b, z1.b[0]
+; CHECK:    fmlal za.h[w8, 14:15], z0.b, z1.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 %slice,
+                                                        <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
+                                                        i32 0)
+  %add = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 %add,
+                                                        <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
+                                                        i32 15)
+  ret void
+}
+
+define void @test_fmlal_lane_vg2x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_lane_vg2x2:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z2.b[0]
+; CHECK:    fmlal za.h[w8, 6:7, vgx2], { z0.b, z1.b }, z2.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 %slice,
+                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                        <vscale x 16 x i8> %zm,
+                                                        i32 0)
+  %add = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 %add,
+                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                        <vscale x 16 x i8> %zm,
+                                                        i32 15)
+  ret void
+}
+
+define void @test_fmlal_lane_vg2x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_lane_vg2x4:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z4.b[0]
+; CHECK:    fmlal za.h[w8, 6:7, vgx4], { z0.b - z3.b }, z4.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 %slice,
+                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                        <vscale x 16 x i8> %zm,
+                                                        i32 0)
+  %add = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 %add,
+                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                        <vscale x 16 x i8> %zm,
+                                                        i32 15)
+  ret void
+}
+
+; FMLALL (indexed)
+
+define void @test_fmlall_lane_vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_lane_vg4x1:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3], z0.b, z1.b[0]
+; CHECK:    fmlall za.s[w8, 12:15], z0.b, z1.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 %slice,
+                                                         <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
+                                                         i32 0)
+  %add = add i32 %slice, 12
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 %add,
+                                                         <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
+                                                         i32 15)
+  ret void
+}
+
+define void @test_fmlall_lane_vg4x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_lane_vg4x2:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z2.b[0]
+; CHECK:    fmlall za.s[w8, 4:7, vgx2], { z0.b, z1.b }, z2.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 %slice,
+                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                         <vscale x 16 x i8> %zm,
+                                                         i32 0)
+  %add = add i32 %slice, 4
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 %add,
+                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                         <vscale x 16 x i8> %zm,
+                                                         i32 15)
+  ret void
+}
+
+define void @test_fmlall_lane_vg4x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_lane_vg4x4:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z4.b[8]
+; CHECK:    fmlall za.s[w8, 4:7, vgx4], { z0.b - z3.b }, z4.b[15]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 %slice,
+                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                         <vscale x 16 x i8> %zm,
+                                                         i32 8)
+  %add = add i32 %slice, 4
+  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 %add,
+                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                         <vscale x 16 x i8> %zm,
+                                                         i32 15)
+  ret void
+}
+
+
+; FMLAL (single)
+
+define void @test_fmlal_single_vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_single_vg2x1:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1], z0.b, z1.b
+; CHECK:    fmlal za.h[w8, 14:15], z0.b, z1.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 %add, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  ret void
+}
+
+define void @test_fmlal_single_vg2x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_single_vg2x2:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z2.b
+; CHECK:    fmlal za.h[w8, 6:7, vgx2], { z0.b, z1.b }, z2.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 %slice,
+                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                          <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 %add,
+                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                          <vscale x 16 x i8> %zm)
+  ret void
+}
+
+define void @test_fmlal_single_vg2x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlal_single_vg2x4:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlal za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z4.b
+; CHECK:    fmlal za.h[w8, 6:7, vgx4], { z0.b - z3.b }, z4.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 %slice,
+                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                          <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 %add,
+                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                          <vscale x 16 x i8> %zm)
+  ret void
+}
+
+; FMLALL (single)
+
+define void @test_fmlall_single_vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_single_vg4x1:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3], z0.b, z1.b
+; CHECK:    fmlall za.s[w8, 12:15], z0.b, z1.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 12
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 %add, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  ret void
+}
+
+
+define void @test_fmlall_single_vg4x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_single_vg4x2:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z2.b
+; CHECK:    fmlall za.s[w8, 4:7, vgx2], { z0.b, z1.b }, z2.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 %slice,
+                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                           <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 4
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 %add,
+                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
+                                                           <vscale x 16 x i8> %zm)
+  ret void
+}
+
+define void @test_fmlall_single_vg4x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: test_fmlall_single_vg4x4:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmlall za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z4.b
+; CHECK:    fmlall za.s[w8, 4:7, vgx4], { z0.b - z3.b }, z4.b
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 %slice,
+                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                           <vscale x 16 x i8> %zm)
+  %add = add i32 %slice, 4
+  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 %add,
+                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
+                                                           <vscale x 16 x i8> %zm)
+  ret void
+}
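
For anyone trying the new builtins end to end, here is a minimal usage sketch mirroring the tests above. It is illustrative only: the function name `fp8_acc_step` and the idea of taking a prebuilt fpm_t from the caller are assumptions, not part of this patch. Because the builtins carry SetsFPMR, Clang emits a call to llvm.aarch64.set.fpmr with the trailing fpm argument before each SME intrinsic.

#include <arm_sme.h>

// Accumulate an FP8 outer product into 32-bit ZA tile 0, then an FP8
// multiply-add-long-long into a vg4x1 slice group (lane 0 of zm),
// reusing the same caller-provided FPMR configuration.
void fp8_acc_step(svbool_t pn, svbool_t pm, svmfloat8_t zn, svmfloat8_t zm,
                  uint32_t slice, fpm_t fpm) __arm_streaming __arm_inout("za") {
  svmopa_za32_mf8_m_fpm(0, pn, pm, zn, zm, fpm);        // FMOPA (widening)
  svmla_lane_za32_mf8_vg4x1_fpm(slice, zn, zm, 0, fpm); // FMLALL (indexed)
}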