Skip to content

Commit 2371a64

Browse files
[AArch64] Add intrinsics for non-widening FMOPA/FMOPS (#88105)
According to the specification in ARM-software/acle#309 this adds the intrinsics void svmopa_za16[_f16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za"); void svmops_za16[_f16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za"); as well as the corresponding `bf16` variants.
1 parent fa4e899 commit 2371a64

File tree

7 files changed

+219
-7
lines changed

7 files changed

+219
-7
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,3 +708,27 @@ let TargetGuard = "sme2" in {
708708
def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
709709
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
710710
}
711+
712+
////////////////////////////////////////////////////////////////////////////////
713+
// SME2p1 - FMOPA, FMOPS (non-widening)
714+
let TargetGuard = "sme2,b16b16" in {
715+
def SVMOPA_BF16_NW : SInst<"svmopa_za16[_bf16]_m", "viPPdd", "b",
716+
MergeNone, "aarch64_sme_mopa",
717+
[IsStreaming, IsInOutZA],
718+
[ImmCheck<0, ImmCheck0_1>]>;
719+
def SVMOPS_BF16_NW : SInst<"svmops_za16[_bf16]_m", "viPPdd", "b",
720+
MergeNone, "aarch64_sme_mops",
721+
[IsStreaming, IsInOutZA],
722+
[ImmCheck<0, ImmCheck0_1>]>;
723+
}
724+
725+
let TargetGuard = "sme-f16f16" in {
726+
def SVMOPA_F16_NW : SInst<"svmopa_za16[_f16]_m", "viPPdd", "h",
727+
MergeNone, "aarch64_sme_mopa",
728+
[IsStreaming, IsInOutZA],
729+
[ImmCheck<0, ImmCheck0_1>]>;
730+
def SVMOPS_F16_NW : SInst<"svmops_za16[_f16]_m", "viPPdd", "h",
731+
MergeNone, "aarch64_sme_mops",
732+
[IsStreaming, IsInOutZA],
733+
[ImmCheck<0, ImmCheck0_1>]>;
734+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
2+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK
3+
// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX
4+
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK
5+
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX
6+
7+
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -S -O2 -Werror -o /dev/null %s
8+
9+
// REQUIRES: aarch64-registered-target
10+
11+
#include <arm_sme.h>
12+
13+
#ifdef SME_OVERLOADED_FORMS
14+
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
15+
#else
16+
#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
17+
#endif
18+
19+
// CHECK-LABEL: define dso_local void @test_svmopa_za16_bf16(
20+
// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
23+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
24+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
25+
// CHECK-NEXT: ret void
26+
//
27+
// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmopa_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_(
28+
// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
29+
// CHECK-CXX-NEXT: entry:
30+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
31+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
32+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
33+
// CHECK-CXX-NEXT: ret void
34+
//
35+
void test_svmopa_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
36+
SME_ACLE_FUNC(svmopa_za16, _bf16, _m)(0, pn, pm, zn, zm);
37+
}
38+
39+
// CHECK-LABEL: define dso_local void @test_svmops_za16_bf16(
40+
// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
41+
// CHECK-NEXT: entry:
42+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
43+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
44+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
45+
// CHECK-NEXT: ret void
46+
//
47+
// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmops_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_(
48+
// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
49+
// CHECK-CXX-NEXT: entry:
50+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
51+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
52+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
53+
// CHECK-CXX-NEXT: ret void
54+
//
55+
void test_svmops_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
56+
SME_ACLE_FUNC(svmops_za16, _bf16, _m)(0, pn, pm, zn, zm);
57+
}
58+
59+
// CHECK-LABEL: define dso_local void @test_svmopa_za16_f16(
60+
// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
61+
// CHECK-NEXT: entry:
62+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
63+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
64+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
65+
// CHECK-NEXT: ret void
66+
//
67+
// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmopa_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_(
68+
// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
69+
// CHECK-CXX-NEXT: entry:
70+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
71+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
72+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
73+
// CHECK-CXX-NEXT: ret void
74+
//
75+
void test_svmopa_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
76+
SME_ACLE_FUNC(svmopa_za16, _f16, _m)(0, pn, pm, zn, zm);
77+
}
78+
79+
// CHECK-LABEL: define dso_local void @test_svmops_za16_f16(
80+
// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
81+
// CHECK-NEXT: entry:
82+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
83+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
84+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
85+
// CHECK-NEXT: ret void
86+
//
87+
// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmops_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_(
88+
// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
89+
// CHECK-CXX-NEXT: entry:
90+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
91+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
92+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
93+
// CHECK-CXX-NEXT: ret void
94+
//
95+
void test_svmops_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
96+
SME_ACLE_FUNC(svmops_za16, _f16, _m)(0, pn, pm, zn, zm);
97+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s
2+
3+
// REQUIRES: aarch64-registered-target
4+
5+
#include <arm_sme.h>
6+
7+
void test_features(svbool_t pn, svbool_t pm,
8+
svfloat16_t zn, svfloat16_t zm,
9+
svbfloat16_t znb, svbfloat16_t zmb)
10+
__arm_streaming __arm_inout("za") {
11+
// expected-error@+1 {{'svmopa_za16_bf16_m' needs target feature sme2,b16b16}}
12+
svmopa_za16_bf16_m(0, pn, pm, znb, zmb);
13+
// expected-error@+1 {{'svmops_za16_bf16_m' needs target feature sme2,b16b16}}
14+
svmops_za16_bf16_m(0, pn, pm, znb, zmb);
15+
// expected-error@+1 {{'svmopa_za16_f16_m' needs target feature sme-f16f16}}
16+
svmopa_za16_f16_m(0, pn, pm, zn, zm);
17+
// expected-error@+1 {{'svmops_za16_f16_m' needs target feature sme-f16f16}}
18+
svmops_za16_f16_m(0, pn, pm, zn, zm);
19+
}
20+
21+
void test_imm(svbool_t pn, svbool_t pm,
22+
svfloat16_t zn, svfloat16_t zm,
23+
svbfloat16_t znb, svbfloat16_t zmb)
24+
__arm_streaming __arm_inout("za") {
25+
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
26+
svmopa_za16_bf16_m(-1, pn, pm, znb, zmb);
27+
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
28+
svmops_za16_bf16_m(-1, pn, pm, znb, zmb);
29+
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
30+
svmopa_za16_f16_m(-1, pn, pm, zn, zm);
31+
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
32+
svmops_za16_f16_m(-1, pn, pm, zn, zm);
33+
}
34+

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3649,3 +3649,6 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
36493649
def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
36503650

36513651
def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
3652+
3653+
def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic;
3654+
def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic;

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -815,8 +815,8 @@ defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011,
815815
defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
816816
defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
817817

818-
defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>;
819-
defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>;
818+
defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, nxv8f16, int_aarch64_sme_mopa>;
819+
defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch64_sme_mops>;
820820
}
821821

822822
let Predicates = [HasSME2, HasB16B16] in {
@@ -862,8 +862,8 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011
862862
defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">;
863863
defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">;
864864

865-
defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>;
866-
defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>;
865+
defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>;
866+
defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>;
867867
}
868868

869869
let Predicates = [HasSME2, HasFP8] in {
@@ -925,7 +925,7 @@ defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, M
925925
defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
926926
defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
927927

928-
defm FMOPA_MPPZZ_BtoH : sme2p1_fmop_tile_fp16<"fmopa", 0b1, 0b0, 0b01, ZPR8>;
928+
defm FMOPA_MPPZZ_BtoH : sme2p1_fmop_tile_f8f16<"fmopa", 0b1, 0b0, 0b01>;
929929

930930
} //[HasSMEF8F16]
931931

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,14 +286,26 @@ multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op>
286286
def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_7, nxv2i1, nxv2f64>;
287287
}
288288

289-
multiclass sme2p1_fmop_tile_fp16<string mnemonic, bit bf, bit s, bits<2> op, ZPRRegOp zpr_ty>{
290-
def NAME : sme_fp_outer_product_inst<s, {0,bf}, op, TileOp16, zpr_ty, mnemonic> {
289+
multiclass sme2p1_fmop_tile_f8f16<string mnemonic, bit bf, bit s, bits<2> op> {
290+
def NAME : sme_fp_outer_product_inst<s, {0,bf}, op, TileOp16, ZPR8, mnemonic> {
291291
bits<1> ZAda;
292292
let Inst{2-1} = 0b00;
293293
let Inst{0} = ZAda;
294294
}
295295
}
296296

297+
multiclass sme2p1_fmop_tile_fp16<string mnemonic, bit bf, bit s, ValueType vt, SDPatternOperator intrinsic = null_frag> {
298+
def NAME : sme_fp_outer_product_inst<s, {0,bf}, 0b11, TileOp16, ZPR16, mnemonic>, SMEPseudo2Instr<NAME, 1> {
299+
bits<1> ZAda;
300+
let Inst{2-1} = 0b00;
301+
let Inst{0} = ZAda;
302+
}
303+
304+
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileH>, SMEPseudo2Instr<NAME, 0>;
305+
306+
def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_1, nxv8i1, vt>;
307+
}
308+
297309
class sme_int_outer_product_inst<bits<3> opc, bit sz, bit sme2,
298310
MatrixTileOperand za_ty, ZPRRegOp zpr_ty,
299311
string mnemonic>
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -verify-machineinstrs < %s | FileCheck %s
3+
4+
target triple = "aarch64-linux"
5+
6+
define void @mopa_bf16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
7+
; CHECK-LABEL: mopa_bf16:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: bfmopa za0.h, p0/m, p1/m, z0.h, z1.h
10+
; CHECK-NEXT: ret
11+
call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
12+
ret void
13+
}
14+
15+
define void @mopa_f16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
16+
; CHECK-LABEL: mopa_f16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: fmopa za1.h, p0/m, p1/m, z0.h, z1.h
19+
; CHECK-NEXT: ret
20+
call void @llvm.aarch64.sme.mopa.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
21+
ret void
22+
}
23+
24+
define void @mops_bf16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
25+
; CHECK-LABEL: mops_bf16:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: bfmops za0.h, p0/m, p1/m, z0.h, z1.h
28+
; CHECK-NEXT: ret
29+
call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
30+
ret void
31+
}
32+
33+
define void @mops_f16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
34+
; CHECK-LABEL: mops_f16:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: fmops za1.h, p0/m, p1/m, z0.h, z1.h
37+
; CHECK-NEXT: ret
38+
call void @llvm.aarch64.sme.mops.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
39+
ret void
40+
}
41+
42+
attributes #0 = {nounwind "target-features" = "+sme,+sme2p1,+bf16,+sme-f16f16,+b16b16" }

0 commit comments

Comments
 (0)