From 3b6510da8fb3b9709839ea0c102355879b11aa6d Mon Sep 17 00:00:00 2001 From: Malay Sanghi Date: Tue, 5 Nov 2024 13:37:54 +0800 Subject: [PATCH 1/4] [X86][AMX] Support AMX-MOVRS Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- clang/include/clang/Basic/BuiltinsX86_64.def | 14 ++ clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 18 +- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/amxmovrsintrin.h | 48 +++++ clang/lib/Headers/amxtransposeintrin.h | 177 ++++++++++++++++++ clang/lib/Headers/immintrin.h | 4 + clang/lib/Sema/SemaX86.cpp | 6 + clang/test/CodeGen/X86/amx_movrs.c | 25 +++ clang/test/CodeGen/X86/amx_movrs_api.c | 34 ++++ clang/test/CodeGen/X86/amx_movrs_errors.c | 14 ++ clang/test/CodeGen/X86/amx_movrs_tranpose.c | 53 ++++++ .../test/CodeGen/X86/amx_movrs_tranpose_api.c | 81 ++++++++ .../CodeGen/X86/amx_movrs_transpose_errors.c | 22 +++ llvm/include/llvm/IR/IntrinsicsX86.td | 48 +++++ llvm/lib/Target/X86/X86.td | 3 + llvm/lib/Target/X86/X86ExpandPseudo.cpp | 35 ++++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 109 ++++++++++- llvm/lib/Target/X86/X86ISelLowering.cpp | 81 ++++++++ llvm/lib/Target/X86/X86InstrAMX.td | 91 +++++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 1 + llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 8 +- llvm/lib/Target/X86/X86RegisterInfo.cpp | 10 +- llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll | 108 +++++++++++ .../X86/amx_movrs_transpose_intrinsics.ll | 92 +++++++++ .../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 98 ++++++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 89 +++++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 97 ++++++++++ 31 files changed, 1371 insertions(+), 6 deletions(-) create mode 100644 clang/lib/Headers/amxmovrsintrin.h create mode 100755 clang/test/CodeGen/X86/amx_movrs.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_api.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_errors.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose_api.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_transpose_errors.c create mode 100755 llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll create mode 100755 llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll create mode 100755 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index d95e8455a304b..98235023bddc7 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") @@ -129,15 +131,27 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") + // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") + +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 805b79491e6ea..c36adb673dd4e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; def mamx_transpose : Flag<["-"], "mamx-transpose">, Group; def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group; +def mamx_movrs: Flag<["-"], "mamx-movrs">, Group; +def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group; def msse : Flag<["-"], "msse">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index d7d3adef42c79..0ddc1ac4c47f2 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXCOMPLEX = true; } else if (Feature == "+amx-fp8") { HasAMXFP8 = true; + } else if (Feature == "+amx-movrs") { + HasAMXMOVRS = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { @@ -953,6 +955,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_COMPLEX__"); if (HasAMXFP8) Builder.defineMacro("__AMX_FP8__"); + if (HasAMXMOVRS) + Builder.defineMacro("__AMX_MOVRS__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) @@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-fp8", true) .Case("amx-int8", true) + .Case("amx-movrs", true) .Case("amx-tile", true) .Case("amx-transpose", true) .Case("avx", true) @@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) + .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tile", HasAMXTILE) .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e2eba63b99235..54a078d2f137b 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; + bool HasAMXMOVRS = false; bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 34fedd6711475..02ee0132bbb5e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16996,9 +16996,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } // Corresponding to intrisics which will return 2 tiles (tile0_tile1). case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { Intrinsic::ID IID; switch (BuiltinID) { default: @@ -17006,15 +17010,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_t2rpntlvwz0_internal: IID = Intrinsic::x86_t2rpntlvwz0_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + IID = Intrinsic::x86_t2rpntlvwz0rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: IID = Intrinsic::x86_t2rpntlvwz0t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1_internal: IID = Intrinsic::x86_t2rpntlvwz1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + IID = Intrinsic::x86_t2rpntlvwz1rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: IID = Intrinsic::x86_t2rpntlvwz1t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; + break; } // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 67242cd4d981b..a50cf01eac6fe 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -151,6 +151,7 @@ set(x86_files amxfp8intrin.h amxintrin.h amxtransposeintrin.h + amxmovrsintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h new file mode 100644 index 0000000000000..5fe2fdecb8b5d --- /dev/null +++ b/clang/lib/Headers/amxmovrsintrin.h @@ -0,0 +1,48 @@ +/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-------------------------------------------------------------------=== */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXMOVRSINTRIN_H +#define __AMXMOVRSINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_MOVRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs"))) + +#define _tile_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +#define _tile_stream_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrs64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrst164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS +__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs( + __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride); +} +#undef __DEFAULT_FN_ATTRS_MOVRS +#endif /* __x86_64__ */ +#endif /* __AMXMOVRSINTRIN_H */ diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h index b3fa37d766c45..086c9a75222ca 100644 --- a/clang/lib/Headers/amxtransposeintrin.h +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -17,6 +17,9 @@ #define __DEFAULT_FN_ATTRS_TRANSPOSE \ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) +#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) #define _tile_2rpntlvwz0(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz0(tdst, base, stride) @@ -26,6 +29,15 @@ __builtin_ia32_t2rpntlvwz1(tdst, base, stride) #define _tile_2rpntlvwz1t1(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) +// MOVRS versions +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) /// Transpose 32-bit elements from \a src and write the result to \a dst. /// @@ -101,6 +113,45 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { return __builtin_ia32_ttransposed_internal(m, n, src); } +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + /// Converts a pair of tiles from memory into VNNI format, and places the /// results in a pair of destinations specified by dst. The pair of tiles /// in memory is specified via a tsib; the second tile is after the first @@ -229,6 +280,131 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, &dst1->tile, base, stride); } +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + /// Transpose 32-bit elements from src and write the result to dst. /// /// \headerfile @@ -244,5 +420,6 @@ static void __tile_transposed(__tile1024i *dst, __tile1024i src) { dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); } +#undef __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS #endif /* __x86_64__ */ #endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 4bf7eac4195ee..5035f02d889e7 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -656,6 +656,10 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index ef878d16d445f..4d3b0292a56a9 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { return false; case X86::BI__builtin_ia32_tileloadd64: case X86::BI__builtin_ia32_tileloaddt164: + case X86::BI__builtin_ia32_tileloaddrs64: + case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: case X86::BI__builtin_ia32_t2rpntlvwz0: case X86::BI__builtin_ia32_t2rpntlvwz0t1: case X86::BI__builtin_ia32_t2rpntlvwz1: case X86::BI__builtin_ia32_t2rpntlvwz1t1: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1: + case X86::BI__builtin_ia32_t2rpntlvwz1rs: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1: + case X86::BI__builtin_ia32_t2rpntlvwz0rs: return CheckBuiltinTileArgumentsRange(TheCall, 0); case X86::BI__builtin_ia32_tdpbssd: case X86::BI__builtin_ia32_tdpbsud: diff --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c new file mode 100755 index 0000000000000..4a8f001baafce --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs.c @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -target-feature +avx512f \ +// RUN: -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include +#include + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrs_internal(row, col, buf, STRIDE); +} + +void test_tile_loaddt1(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrst1_internal(row, col, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c new file mode 100755 index 0000000000000..cf430adf14085 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_api.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include +#include + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loaddt1(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_stream_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loadd_macro(void *data) { + // CHECK-LABEL: define dso_local void @test_tile_loadd_macro( + // CHECK: call void @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + // CHECK: call void @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + _tile_loaddrs(4, data, STRIDE); + _tile_stream_loaddrs(2, data, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c new file mode 100755 index 0000000000000..bac7d962f5cb5 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \ +// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify + +#include +#include + +char buf[1024]; + +void test_tile_load() { + _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}} + _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}} +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c new file mode 100755 index 0000000000000..192c153835e1e --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include +#include + +char buf[2048]; +#define STRIDE 32 + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c new file mode 100755 index 0000000000000..b174cc5067bf3 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c @@ -0,0 +1,81 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include +#include + +char buf[2048]; +#define STRIDE 32 + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rs + // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rs(1, A, B); +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rst1(1, A, B); +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rs + // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rs(1, A, B); +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rst1(1, A, B); +} + +void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c new file mode 100755 index 0000000000000..c8846b36ffa87 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ +// RUN: -emit-llvm -verify + +#include +#include + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index c42397024e45a..d37bda13c3c4a 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in { def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg>]>; + def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, @@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>]>; + // AMX-MORVS, AMX-TRANSPOSE + def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5966,6 +5986,16 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tileloaddrs64_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_tileloaddrst164_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; def int_x86_tdpbssd_internal : ClangBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], @@ -6030,6 +6060,24 @@ let TargetPrefix = "x86" in { llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics + def int_x86_t2rpntlvwz0rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly, IntrReadMem]>; + def int_x86_t2rpntlvwz0rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz0_internal : Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 160e7c0fc0310..dfeffae6dec4f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", "Support AMX-FP8 instructions", [FeatureAMXTILE]>; +def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", + "Support AMX-MOVRS instructions", + [FeatureAMXTILE]>; def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", "Support AMX amx-transpose instructions", [FeatureAMXTILE]>; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f832955d1202f..94072502c829a 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -558,6 +558,15 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG))); return true; } + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: { + for (unsigned i = 2; i > 0; --i) + MI.removeOperand(i); + unsigned Opc = + Opcode == X86::PTILELOADDRSV ? X86::TILELOADDRS : X86::TILELOADDRST1; + MI.setDesc(TII->get(Opc)); + return true; + } case X86::PTILELOADDV: case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) @@ -687,6 +696,32 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { + for (unsigned i = 3; i > 0; --i) + MI.removeOperand(i); + unsigned Opc; + switch (Opcode) { + case X86::PT2RPNTLVWZ0RSV: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1V: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RSV: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1V: + Opc = X86::T2RPNTLVWZ1RST1; + break; + default: + llvm_unreachable("Impossible Opcode!"); + } + MI.setDesc(TII->get(Opc)); + return true; + } case X86::PTTRANSPOSEDV: { for (int i = 2; i > 0; --i) MI.removeOperand(i); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index aea86c280e2f9..b2b18c1cf4557 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -338,6 +338,10 @@ namespace { case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: return true; } for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { @@ -5189,6 +5193,33 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: { + if (!Subtarget->hasAMXMOVRS()) + break; + unsigned Opc = IntNo == Intrinsic::x86_tileloaddrs64_internal + ? X86::PTILELOADDRSV + : X86::PTILELOADDRST1V; + // _tile_loadd_internal(row, col, buf, STRIDE) + SDValue Base = Node->getOperand(4); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(5); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {Node->getOperand(2), + Node->getOperand(3), + Base, + Scale, + Index, + Disp, + Segment, + Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } } break; } @@ -5307,6 +5338,44 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64: + case Intrinsic::x86_tileloaddrst164: { + if (!Subtarget->hasAMXMOVRS()) + break; + auto *MFI = + CurDAG->getMachineFunction().getInfo(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloaddrs64: + Opc = X86::PTILELOADDRS; + break; + case Intrinsic::x86_tileloaddrst164: + Opc = X86::PTILELOADDRST1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + if (Opc == X86::PTILESTORED) { + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, TReg, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } else { + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } + ReplaceNode(Node, CNode); + return; + } case Intrinsic::x86_t2rpntlvwz0: case Intrinsic::x86_t2rpntlvwz0t1: case Intrinsic::x86_t2rpntlvwz1: @@ -5342,9 +5411,45 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceNode(Node, CNode); + return; + } + case Intrinsic::x86_t2rpntlvwz0rs: + case Intrinsic::x86_t2rpntlvwz0rst1: + case Intrinsic::x86_t2rpntlvwz1rs: + case Intrinsic::x86_t2rpntlvwz1rst1: { + if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS()) + break; + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0rs: + Opc = X86::PT2RPNTLVWZ0RS; + break; + case Intrinsic::x86_t2rpntlvwz0rst1: + Opc = X86::PT2RPNTLVWZ0RST1; + break; + case Intrinsic::x86_t2rpntlvwz1rs: + Opc = X86::PT2RPNTLVWZ1RS; + break; + case Intrinsic::x86_t2rpntlvwz1rst1: + Opc = X86::PT2RPNTLVWZ1RST1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); return; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0ae814d0ca3bb..e9a3b0b675564 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27291,6 +27291,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + if (!Subtarget.hasAMXTRANSPOSE() || !Subtarget.hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0_internal: case Intrinsic::x86_t2rpntlvwz0t1_internal: case Intrinsic::x86_t2rpntlvwz1_internal: @@ -27316,6 +27323,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::x86_t2rpntlvwz1t1_internal: Opc = X86::PT2RPNTLVWZ1T1V; break; + case Intrinsic::x86_t2rpntlvwz0rs_internal: + Opc = X86::PT2RPNTLVWZ0RSV; + break; + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + Opc = X86::PT2RPNTLVWZ0RST1V; + break; + case Intrinsic::x86_t2rpntlvwz1rs_internal: + Opc = X86::PT2RPNTLVWZ1RSV; + break; + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + Opc = X86::PT2RPNTLVWZ1RST1V; + break; } SDLoc DL(Op); @@ -37553,6 +37572,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTILELOADDRS: + case X86::PTILELOADDRST1: { + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("illegal opcode!"); + case X86::PTILELOADDRS: + Opc = X86::TILELOADDRS; + break; + case X86::PTILELOADDRST1: + Opc = X86::TILELOADDRST1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); + unsigned CurOp = 0; + if (Opc != X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Define); + MIB.add(MI.getOperand(CurOp++)); // base + MIB.add(MI.getOperand(CurOp++)); // scale + MIB.add(MI.getOperand(CurOp++)); // index -- stride + MIB.add(MI.getOperand(CurOp++)); // displacement + MIB.add(MI.getOperand(CurOp++)); // segment + if (Opc == X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Undef); + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } case X86::PTCMMIMFP16PS: case X86::PTCMMRLFP16PS: { const MIMetadata MIMD(MI); @@ -37605,6 +37653,39 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0RS: + case X86::PT2RPNTLVWZ0RST1: + case X86::PT2RPNTLVWZ1RS: + case X86::PT2RPNTLVWZ1RST1: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PT2RPNTLVWZ0RS: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RS: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1: + Opc = X86::T2RPNTLVWZ1RST1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); + + MIB.add(MI.getOperand(1)); // base + MIB.add(MI.getOperand(2)); // scale + MIB.add(MI.getOperand(3)); // index + MIB.add(MI.getOperand(4)); // displacement + MIB.add(MI.getOperand(5)); // segment + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } case X86::PTTRANSPOSED: { const DebugLoc &DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 947a8bec2890e..efd396cd2bc43 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -369,3 +369,94 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { } } } // HasAMXTILE, HasAMXTRANSPOSE + +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { + def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + let isPseudo = true in { + def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + } + let usesCustomInserter = 1 in { + def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + } +} // HasAMXMOVRS, HasAMXTRANSPOSE + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { + def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, XD; + def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, PD; + + let isPseudo = true, mayLoad = 1 in + def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + let isPseudo = true, mayLoad = 1 in + def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + let usesCustomInserter = 1 in { + let mayLoad = 1 in + def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in + def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + } + + def TILELOADDRSrm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T8, XD; + def TILELOADDRST1rm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T8, PD; + + def T2RPNTLVWZ0RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5; + def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5; + def T2RPNTLVWZ1RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5, PD; + def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5, PD; +} // HasAMXMOVRS, In64BitMode diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9b002ebd3a93b..41bece5e2cec4 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4738,6 +4738,7 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: case X86::PTILEPAIRLOAD: + case X86::TILELOADDRS: return true; } } diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index d22e7dadaaa26..7a31e4212670b 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; +def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 688e886cf3b13..c5c0f7a03b33a 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -229,7 +229,13 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: - case Intrinsic::x86_tilestored64_internal: { + case Intrinsic::x86_tilestored64_internal: + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: { Row = II->getArgOperand(0); Col = II->getArgOperand(1); break; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 2daaa95b06be0..6c7fc4bd49e80 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1078,7 +1078,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: - case X86::PTTRANSPOSEDV: { + case X86::PTTRANSPOSEDV: + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); @@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PT2RPNTLVWZ0V: case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: { + case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); MachineOperand &MO3 = MI->getOperand(3); diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll new file mode 100755 index 0000000000000..da212a1850964 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s + +define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64) + +define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64) + +define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_t1_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64) + +define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_t1_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64) diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll new file mode 100755 index 0000000000000..146b69773eb18 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 + +define void @test_amx(i64 %stride, i8* %addr1) #0 { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: retq + call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) + ret void +} +declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) + +define void @test_amx2(i8* %base, i64 %stride) #0 { +; O0-LABEL: test_amx2: +; O0: # %bb.0: +; O0-NEXT: xorps %xmm0, %xmm0 +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: tilerelease +; O0-NEXT: retq +; +; O2-LABEL: test_amx2: +; O2: # %bb.0: +; O2-NEXT: xorps %xmm0, %xmm0 +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, %ax +; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: tilerelease +; O2-NEXT: retq + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + ret void +} +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt new file mode 100755 index 0000000000000..6df44c87d2332 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -0,0 +1,98 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL + +# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%rbx), %tmm6 +# INTEL: tileloaddrs tmm6, [rbx + 64] +0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%rbx), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbx + 64] +0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s new file mode 100755 index 0000000000000..d780ad4f0e369 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -0,0 +1,89 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s new file mode 100755 index 0000000000000..ccc7ac51a98a4 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -0,0 +1,97 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40] + tileloaddrs tmm6, [rbx + 64] + +// CHECK: tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40] + tileloaddrst1 tmm6, [rbx + 64] + +// CHECK: tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 tmm3, [2*rbp - 32] From e7a09d7ccbbcd0ed222cdbc57236d2158306457e Mon Sep 17 00:00:00 2001 From: Malay Sanghi Date: Wed, 6 Nov 2024 18:16:51 +0800 Subject: [PATCH 2/4] update test --- clang/test/CodeGen/X86/amx_movrs_errors.c | 2 +- clang/test/CodeGen/X86/amx_movrs_transpose_errors.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c index bac7d962f5cb5..2790126eb8672 100755 --- a/clang/test/CodeGen/X86/amx_movrs_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ // RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \ -// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify +// RUN: -target-feature +amx-format -target-feature +amx-element -verify #include #include diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c index c8846b36ffa87..840b52bbb29bb 100755 --- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ // RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ -// RUN: -emit-llvm -verify +// RUN: -verify #include #include From 193420c854b52023e1a2874f05e1e019436111ab Mon Sep 17 00:00:00 2001 From: Malay Sanghi Date: Mon, 11 Nov 2024 07:14:22 -0800 Subject: [PATCH 3/4] review --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Headers/CMakeLists.txt | 3 +- clang/lib/Headers/amxmovrstransposeintrin.h | 201 ++++++++++++++++++ clang/lib/Headers/amxtransposeintrin.h | 177 --------------- clang/lib/Headers/immintrin.h | 1 + clang/test/CodeGen/X86/amx_movrs_errors.c | 4 +- .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/Target/X86/X86ExpandPseudo.cpp | 37 ++-- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 127 ++++------- llvm/lib/Target/X86/X86ISelLowering.cpp | 65 ++---- llvm/lib/Target/X86/X86InstrAMX.td | 26 --- llvm/lib/Target/X86/X86InstrInfo.cpp | 1 - llvm/lib/TargetParser/Host.cpp | 1 + llvm/lib/TargetParser/X86TargetParser.cpp | 1 + 14 files changed, 272 insertions(+), 374 deletions(-) create mode 100644 clang/lib/Headers/amxmovrstransposeintrin.h diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c3424e0e6f34c..302eb8bf3fd0b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -739,6 +739,7 @@ X86 Support * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. - Support ISA of ``AMX-FP8``. - Support ISA of ``AMX-TRANSPOSE``. +- Support ISA of ``AMX-MOVRS``. - Support ISA of ``AMX-AVX512``. Arm and AArch64 Support diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d7119be4ef8a8..e52a6f9dd40d9 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -151,8 +151,9 @@ set(x86_files amxfp16intrin.h amxfp8intrin.h amxintrin.h - amxtransposeintrin.h amxmovrsintrin.h + amxmovrstransposeintrin.h + amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h new file mode 100644 index 0000000000000..84360e6bef838 --- /dev/null +++ b/clang/lib/Headers/amxmovrstransposeintrin.h @@ -0,0 +1,201 @@ +/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics -*- C++ + * -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; use instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H +#define __AMX_MOVRS_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) + +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +#undef __DEFAULT_FN_ATTRS +#endif /* __x86_64__ */ +#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ \ No newline at end of file diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h index 086c9a75222ca..b3fa37d766c45 100644 --- a/clang/lib/Headers/amxtransposeintrin.h +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -17,9 +17,6 @@ #define __DEFAULT_FN_ATTRS_TRANSPOSE \ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) -#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-transpose,amx-movrs"))) #define _tile_2rpntlvwz0(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz0(tdst, base, stride) @@ -29,15 +26,6 @@ __builtin_ia32_t2rpntlvwz1(tdst, base, stride) #define _tile_2rpntlvwz1t1(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) -// MOVRS versions -#define _tile_2rpntlvwz0rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) -#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) -#define _tile_2rpntlvwz1rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) -#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) /// Transpose 32-bit elements from \a src and write the result to \a dst. /// @@ -113,45 +101,6 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { return __builtin_ia32_ttransposed_internal(m, n, src); } -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0, - unsigned short col1, _tile1024i *dst0, - _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0, - unsigned short col1, _tile1024i *dst0, - _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0, - unsigned short col1, _tile1024i *dst0, - _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0, - unsigned short col1, _tile1024i *dst0, - _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - /// Converts a pair of tiles from memory into VNNI format, and places the /// results in a pair of destinations specified by dst. The pair of tiles /// in memory is specified via a tsib; the second tile is after the first @@ -280,131 +229,6 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, &dst1->tile, base, stride); } -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0RS instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0T1RS instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1T1RS instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - /// Transpose 32-bit elements from src and write the result to dst. /// /// \headerfile @@ -420,6 +244,5 @@ static void __tile_transposed(__tile1024i *dst, __tile1024i src) { dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); } -#undef __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS #endif /* __x86_64__ */ #endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 37e6ff071e26b..574dc79fa1b24 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -658,6 +658,7 @@ _storebe_i64(void * __P, long long __D) { #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) #include +#include #endif #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__) diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c index 2790126eb8672..4263e75ce9a28 100755 --- a/clang/test/CodeGen/X86/amx_movrs_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \ -// RUN: -target-feature +amx-format -target-feature +amx-element -verify +// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 \ +// RUN: -target-feature +amx-element -verify #include #include diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 815556e374bef..026db53b2d926 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -266,6 +266,7 @@ X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") +X86_FEATURE (AMX_MOVRS, "amx-movrs") X86_FEATURE (AMX_AVX512, "amx-avx512") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 3648a828b6d1a..7c7c5f642a703 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -557,17 +557,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG))); return true; } - case X86::PTILELOADDRSV: - case X86::PTILELOADDRST1V: { - for (unsigned i = 2; i > 0; --i) - MI.removeOperand(i); - unsigned Opc = - Opcode == X86::PTILELOADDRSV ? X86::TILELOADDRS : X86::TILELOADDRST1; - MI.setDesc(TII->get(Opc)); - return true; - } case X86::PTILELOADDV: case X86::PTILELOADDT1V: + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: case X86::PTCVTROWD2PSrreV: case X86::PTCVTROWD2PSrriV: case X86::PTCVTROWPS2PBF16HrreV: @@ -584,6 +577,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + case X86::PTILELOADDRSV: + Opc = X86::TILELOADDRS; + break; + case X86::PTILELOADDRST1V: + Opc = X86::TILELOADDRST1; + break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); break; @@ -728,7 +727,11 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PT2RPNTLVWZ0V: case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: { + case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { for (unsigned i = 3; i > 0; --i) MI.removeOperand(i); unsigned Opc; @@ -745,20 +748,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PT2RPNTLVWZ1T1V: Opc = X86::T2RPNTLVWZ1T1; break; - default: - llvm_unreachable("Impossible Opcode!"); - } - MI.setDesc(TII->get(Opc)); - return true; - } - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - for (unsigned i = 3; i > 0; --i) - MI.removeOperand(i); - unsigned Opc; - switch (Opcode) { case X86::PT2RPNTLVWZ0RSV: Opc = X86::T2RPNTLVWZ0RS; break; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 96df1d8b464a2..e923d9438e626 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5161,6 +5161,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Res); return; } + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: + if (!Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) @@ -5168,36 +5173,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) { auto *MFI = CurDAG->getMachineFunction().getInfo(); MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal - ? X86::PTILELOADDV - : X86::PTILELOADDT1V; - // _tile_loadd_internal(row, col, buf, STRIDE) - SDValue Base = Node->getOperand(4); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(5); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; - SDValue Ops[] = {Node->getOperand(2), - Node->getOperand(3), - Base, - Scale, - Index, - Disp, - Segment, - Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); - ReplaceNode(Node, CNode); - return; - } - case Intrinsic::x86_tileloaddrs64_internal: - case Intrinsic::x86_tileloaddrst164_internal: { - if (!Subtarget->hasAMXMOVRS()) + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloaddrs64_internal: + Opc = X86::PTILELOADDRSV; + break; + case Intrinsic::x86_tileloaddrst164_internal: + Opc = X86::PTILELOADDRST1V; break; - unsigned Opc = IntNo == Intrinsic::x86_tileloaddrs64_internal - ? X86::PTILELOADDRSV - : X86::PTILELOADDRST1V; + case Intrinsic::x86_tileloadd64_internal: + Opc = X86::PTILELOADDV; + break; + case Intrinsic::x86_tileloaddt164_internal: + Opc = X86::PTILELOADDT1V; + break; + } // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); @@ -5301,6 +5293,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64: + case Intrinsic::x86_tileloaddrst164: + if (!Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_tileloadd64: case Intrinsic::x86_tileloaddt164: case Intrinsic::x86_tilestored64: { @@ -5313,46 +5310,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; - case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; - case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; - } - // FIXME: Match displacement and scale. - unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; - if (Opc == X86::PTILESTORED) { - SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - } else { - SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - } - ReplaceNode(Node, CNode); - return; - } - case Intrinsic::x86_tileloaddrs64: - case Intrinsic::x86_tileloaddrst164: { - if (!Subtarget->hasAMXMOVRS()) - break; - auto *MFI = - CurDAG->getMachineFunction().getInfo(); - MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_tileloaddrs64: Opc = X86::PTILELOADDRS; break; + case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; case Intrinsic::x86_tileloaddrst164: Opc = X86::PTILELOADDRST1; break; + case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; } // FIXME: Match displacement and scale. unsigned TIndex = Node->getConstantOperandVal(2); @@ -5365,15 +5330,22 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; if (Opc == X86::PTILESTORED) { - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, TReg, Chain}; + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } else { - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_t2rpntlvwz0rs: + case Intrinsic::x86_t2rpntlvwz0rst1: + case Intrinsic::x86_t2rpntlvwz1rs: + case Intrinsic::x86_t2rpntlvwz1rst1: + if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0: case Intrinsic::x86_t2rpntlvwz0t1: case Intrinsic::x86_t2rpntlvwz1: @@ -5399,31 +5371,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_t2rpntlvwz1t1: Opc = X86::PT2RPNTLVWZ1T1; break; - } - // FIXME: Match displacement and scale. - unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } - case Intrinsic::x86_t2rpntlvwz0rs: - case Intrinsic::x86_t2rpntlvwz0rst1: - case Intrinsic::x86_t2rpntlvwz1rs: - case Intrinsic::x86_t2rpntlvwz1rst1: { - if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS()) - break; - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_t2rpntlvwz0rs: Opc = X86::PT2RPNTLVWZ0RS; break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 10062ff81c470..839fedc34d1d0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27328,9 +27328,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::x86_t2rpntlvwz0rst1_internal: case Intrinsic::x86_t2rpntlvwz1rs_internal: case Intrinsic::x86_t2rpntlvwz1rst1_internal: - if (!Subtarget.hasAMXTRANSPOSE() || !Subtarget.hasAMXMOVRS()) - break; - [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0_internal: case Intrinsic::x86_t2rpntlvwz0t1_internal: case Intrinsic::x86_t2rpntlvwz1_internal: @@ -37527,6 +37524,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); return BB; } + case X86::PTILELOADDRS: + case X86::PTILELOADDRST1: case X86::PTILELOADD: case X86::PTILELOADDT1: case X86::PTILESTORED: { @@ -37544,33 +37543,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; #undef GET_EGPR_IF_ENABLED - } - - MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); - unsigned CurOp = 0; - if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX) - MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), - RegState::Define); - - MIB.add(MI.getOperand(CurOp++)); // base - MIB.add(MI.getOperand(CurOp++)); // scale - MIB.add(MI.getOperand(CurOp++)); // index -- stride - MIB.add(MI.getOperand(CurOp++)); // displacement - MIB.add(MI.getOperand(CurOp++)); // segment - - if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX) - MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), - RegState::Undef); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PTILELOADDRS: - case X86::PTILELOADDRST1: { - unsigned Opc; - switch (MI.getOpcode()) { - default: - llvm_unreachable("illegal opcode!"); case X86::PTILELOADDRS: Opc = X86::TILELOADDRS; break; @@ -37578,19 +37550,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Opc = X86::TILELOADDRST1; break; } + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; - if (Opc != X86::TILESTORED) + if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Define); + MIB.add(MI.getOperand(CurOp++)); // base MIB.add(MI.getOperand(CurOp++)); // scale MIB.add(MI.getOperand(CurOp++)); // index -- stride MIB.add(MI.getOperand(CurOp++)); // displacement MIB.add(MI.getOperand(CurOp++)); // segment - if (Opc == X86::TILESTORED) + + if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Undef); + MI.eraseFromParent(); // The pseudo is gone now. return BB; } @@ -37613,6 +37589,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0RS: + case X86::PT2RPNTLVWZ0RST1: + case X86::PT2RPNTLVWZ1RS: + case X86::PT2RPNTLVWZ1RST1: case X86::PT2RPNTLVWZ0: case X86::PT2RPNTLVWZ0T1: case X86::PT2RPNTLVWZ1: @@ -37634,27 +37614,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1T1: Opc = X86::T2RPNTLVWZ1T1; break; - } - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); - - MIB.add(MI.getOperand(1)); // base - MIB.add(MI.getOperand(2)); // scale - MIB.add(MI.getOperand(3)); // index - MIB.add(MI.getOperand(4)); // displacement - MIB.add(MI.getOperand(5)); // segment - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PT2RPNTLVWZ0RS: - case X86::PT2RPNTLVWZ0RST1: - case X86::PT2RPNTLVWZ1RS: - case X86::PT2RPNTLVWZ1RST1: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0RS: Opc = X86::T2RPNTLVWZ0RS; break; diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 37bdc71fb5da2..68ba40eb315d7 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -433,32 +433,6 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let mayLoad = 1 in def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; } - - def TILELOADDRSrm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T8, XD; - def TILELOADDRST1rm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T8, PD; - - def T2RPNTLVWZ0RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T_MAP5; - def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T_MAP5; - def T2RPNTLVWZ1RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T_MAP5, PD; - def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, EVEX, NoCD8, T_MAP5, PD; } // HasAMXMOVRS, In64BitMode multiclass m_tcvtrowd2ps { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 850f74e666adc..1b95450596314 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4737,7 +4737,6 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: case X86::PTILEPAIRLOAD: - case X86::TILELOADDRS: case X86::PTILEPAIRSTORE: return true; } diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index a973aaaa4806e..a3322f7d32846 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1880,6 +1880,7 @@ const StringMap sys::getHostCPUFeatures() { !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; + Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; bool HasLeaf24 = diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index eb55e6fc9134c..4039e6a0243cc 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -600,6 +600,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = FeatureAMX_TILE | FeatureAVX10_2_512; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; From cd6090f5ee51c45f961bc6ef47169d85d75bf78a Mon Sep 17 00:00:00 2001 From: Malay Sanghi Date: Mon, 11 Nov 2024 20:45:10 -0800 Subject: [PATCH 4/4] review2 --- clang/lib/Headers/amxmovrstransposeintrin.h | 3 +-- clang/lib/Headers/immintrin.h | 4 ++++ clang/test/CodeGen/X86/amx_movrs_errors.c | 3 +-- llvm/lib/Target/X86/X86.td | 4 ++-- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- llvm/lib/Target/X86/X86InstrAMX.td | 17 ++++++++--------- llvm/lib/TargetParser/Host.cpp | 2 +- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h index 84360e6bef838..17a9f7506a042 100644 --- a/clang/lib/Headers/amxmovrstransposeintrin.h +++ b/clang/lib/Headers/amxmovrstransposeintrin.h @@ -1,5 +1,4 @@ -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics -*- C++ - * -*---------=== +/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 502e1c12a4e91..f0dd7160ec7ff 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -658,6 +658,10 @@ _storebe_i64(void * __P, long long __D) { #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) #include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__)) #include #endif diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c index 4263e75ce9a28..495ea299236e2 100755 --- a/clang/test/CodeGen/X86/amx_movrs_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -1,7 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 \ -// RUN: -target-feature +amx-element -verify +// RUN: -target-feature +amx-movrs -verify #include #include diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 3b95d06c464a3..509632183dc01 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -274,8 +274,8 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", "Support AMX-FP8 instructions", [FeatureAMXTILE]>; def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", - "Support AMX-MOVRS instructions", - [FeatureAMXTILE]>; + "Support AMX-MOVRS instructions", + [FeatureAMXTILE]>; def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", "Support AMX amx-transpose instructions", [FeatureAMXTILE]>; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index e923d9438e626..72de0e0e8761f 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5343,7 +5343,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_t2rpntlvwz0rst1: case Intrinsic::x86_t2rpntlvwz1rs: case Intrinsic::x86_t2rpntlvwz1rst1: - if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS()) + if (!Subtarget->hasAMXMOVRS()) break; [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 282d85e9a4b6d..059bfb4c70bcf 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -419,18 +419,17 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, VEX, T8, PD; - let isPseudo = true, mayLoad = 1 in - def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, - opaquemem:$src3), []>; - let isPseudo = true, mayLoad = 1 in - def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + let isPseudo = true, mayLoad = 1 in { + def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; - let usesCustomInserter = 1 in { - let mayLoad = 1 in + def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + } + + let usesCustomInserter = 1, mayLoad = 1 in { def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; - let mayLoad = 1 in def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; } } // HasAMXMOVRS, In64BitMode diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 39115a6d7266a..58ba255363322 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1880,9 +1880,9 @@ const StringMap sys::getHostCPUFeatures() { !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; - Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; + Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);