-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64] Implement intrinsics for SME FP8 FMLAL/FMLALL (single) #118869
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-aarch64 Author: None (SpencerAbson) ChangesThis patch implements the following intrinsics: Multi-vector 8-bit floating-point multiply-add long (single) // Only if __ARM_FEATURE_SME_F8F16 != 0
void svmla[_single]_za16[_mf8]_vg2x1_fpm(uint32_t slice, svmfloat8_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za16[_mf8]_vg2x2_fpm(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za16[_mf8]_vg2x4_fpm(uint32_t slice, svmfloat8x4_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
// Only if __ARM_FEATURE_SME_F8F32 != 0
void svmla[_single]_za32[_mf8]_vg4x1_fpm(uint32_t slice, svmfloat8_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za32[_mf8]_vg4x2_fpm(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za32[_mf8]_vg4x4_fpm(uint32_t slice, svmfloat8x4_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za"); In accordance with ARM-software/acle#323. Co-authored-by: Momchil Velikov [email protected] Note This PR is stacked on #118549 Patch is 74.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118869.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb742..f48374c179c4d7 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -824,4 +824,46 @@ let SMETargetGuard = "sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
+let SMETargetGuard = "sme-f8f32" in {
+ def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
+
+ // FMLALL (indexed)
+ def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // FMLAL (mutliple and single)
+ def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+}
+
+let SMETargetGuard = "sme-f8f16" in {
+ def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
+
+ // FMLAL (indexed)
+ def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // FMLAL (mutliple and single)
+ def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+}
+
} // let SVETargetGuard = InvalidMode
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index cb9c23b8e0a0d0..56595bb4704e74 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
+ case SVETypeFlags::EltTyMFloat8:
+ return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
case SVETypeFlags::EltTyBFloat16:
@@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c
new file mode 100644
index 00000000000000..95d6383ab30efe
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c
@@ -0,0 +1,55 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za16_mf8_m(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svmopa_za16_mf8_mu10__SVBool_tS_u13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_za16_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+ svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svmopa_za16,_mf8,_m_fpm)(1, pn, pm, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za32_mf8_m(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svmopa_za32_mf8_mu10__SVBool_tS_u13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_za32_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+ svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svmopa_za32,_mf8,_m_fpm)(3, pn, pm, zn, zm, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
new file mode 100644
index 00000000000000..85870f5e4c353b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// FMLAL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
+}
+
+// FMLALL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8...
[truncated]
|
@llvm/pr-subscribers-clang-codegen Author: None (SpencerAbson) ChangesThis patch implements the following intrinsics: Multi-vector 8-bit floating-point multiply-add long (single) // Only if __ARM_FEATURE_SME_F8F16 != 0
void svmla[_single]_za16[_mf8]_vg2x1_fpm(uint32_t slice, svmfloat8_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za16[_mf8]_vg2x2_fpm(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za16[_mf8]_vg2x4_fpm(uint32_t slice, svmfloat8x4_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
// Only if __ARM_FEATURE_SME_F8F32 != 0
void svmla[_single]_za32[_mf8]_vg4x1_fpm(uint32_t slice, svmfloat8_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za32[_mf8]_vg4x2_fpm(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
void svmla[_single]_za32[_mf8]_vg4x4_fpm(uint32_t slice, svmfloat8x4_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za"); In accordance with ARM-software/acle#323. Co-authored-by: Momchil Velikov [email protected] Note This PR is stacked on #118549 Patch is 74.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118869.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb742..f48374c179c4d7 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -824,4 +824,46 @@ let SMETargetGuard = "sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
+let SMETargetGuard = "sme-f8f32" in {
+ def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
+
+ // FMLALL (indexed)
+ def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // FMLAL (mutliple and single)
+ def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+}
+
+let SMETargetGuard = "sme-f8f16" in {
+ def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
+
+ // FMLAL (indexed)
+ def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // FMLAL (mutliple and single)
+ def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+ def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
+ [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+}
+
} // let SVETargetGuard = InvalidMode
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index cb9c23b8e0a0d0..56595bb4704e74 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
+ case SVETypeFlags::EltTyMFloat8:
+ return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
case SVETypeFlags::EltTyBFloat16:
@@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c
new file mode 100644
index 00000000000000..95d6383ab30efe
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_fmopa.c
@@ -0,0 +1,55 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za16_mf8_m(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svmopa_za16_mf8_mu10__SVBool_tS_u13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za16(i32 1, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_za16_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+ svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svmopa_za16,_mf8,_m_fpm)(1, pn, pm, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za32_mf8_m(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svmopa_za32_mf8_mu10__SVBool_tS_u13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmopa.za32(i32 3, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_za32_mf8_m(svbool_t pn, svbool_t pm, svmfloat8_t zn,
+ svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svmopa_za32,_mf8,_m_fpm)(3, pn, pm, zn, zm, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
new file mode 100644
index 00000000000000..85870f5e4c353b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// FMLAL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
+}
+
+// FMLALL (indexed)
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x1ju13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 0)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z26test_svmla_lane_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8...
[truncated]
|
Please see #119568. Cheers. |
This patch implements the following intrinsics:
Multi-vector 8-bit floating-point multiply-add long (single)
In accordance with ARM-software/acle#323.
Co-authored-by: Momchil Velikov [email protected]
Note This PR is stacked on #118549