From 1b42b133daaa028e0ac51e3f656aff2faed7a7c6 Mon Sep 17 00:00:00 2001 From: "Wang, Phoebe" Date: Thu, 24 Oct 2024 14:35:37 +0800 Subject: [PATCH 1/3] [X86][AMX] Support AMX-TRANSPOSE Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Basic/BuiltinsX86_64.def | 11 + clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 52 ++++ clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/amxintrin.h | 2 + clang/lib/Headers/amxtransposeintrin.h | 248 ++++++++++++++++++ clang/lib/Headers/immintrin.h | 4 + clang/lib/Sema/SemaX86.cpp | 6 + clang/test/CodeGen/X86/amx_transpose.c | 36 +++ clang/test/CodeGen/X86/amx_transpose_api.c | 66 +++++ clang/test/CodeGen/X86/amx_transpose_errors.c | 31 +++ clang/test/Driver/x86-target-features.c | 7 + clang/test/Preprocessor/x86_target_features.c | 12 + llvm/include/llvm/CodeGen/TileShapeInfo.h | 87 +++++- llvm/include/llvm/IR/IntrinsicsX86.td | 37 +++ .../Support/X86DisassemblerDecoderCommon.h | 1 + .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/Target/X86/AsmParser/X86Operand.h | 31 +++ .../X86/Disassembler/X86Disassembler.cpp | 5 + .../X86/Disassembler/X86DisassemblerDecoder.h | 7 + .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 19 ++ .../X86/MCTargetDesc/X86InstPrinterCommon.h | 1 + llvm/lib/Target/X86/X86.td | 3 + llvm/lib/Target/X86/X86ExpandPseudo.cpp | 125 +++++++++ llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 53 ++-- llvm/lib/Target/X86/X86FastTileConfig.cpp | 40 ++- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 70 +++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 94 +++++++ llvm/lib/Target/X86/X86InstrAMX.td | 63 +++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 12 +- llvm/lib/Target/X86/X86InstrOperands.td | 7 + llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 246 +++++++++++++---- llvm/lib/Target/X86/X86PreTileConfig.cpp | 45 +++- llvm/lib/Target/X86/X86RegisterInfo.cpp | 60 ++++- llvm/lib/Target/X86/X86RegisterInfo.td | 9 + llvm/lib/Target/X86/X86TileConfig.cpp | 82 +++++- llvm/lib/TargetParser/Host.cpp | 4 + llvm/lib/TargetParser/X86TargetParser.cpp | 1 + .../CodeGen/X86/amx_tile_pair_O2_to_O0.ll | 136 ++++++++++ .../X86/amx_tile_pair_configure_O0.mir | 165 ++++++++++++ .../X86/amx_tile_pair_configure_O2.mir | 153 +++++++++++ llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 97 +++++++ .../X86/amx_tile_pair_lower_type_O0.ll | 86 ++++++ .../X86/amx_tile_pair_lower_type_O2.ll | 60 +++++ .../X86/amx_tile_pair_preconfigure_O0.mir | 134 ++++++++++ .../X86/amx_tile_pair_preconfigure_O2.mir | 113 ++++++++ .../CodeGen/X86/amx_transpose_intrinsics.ll | 150 +++++++++++ llvm/test/CodeGen/X86/ipra-reg-usage.ll | 4 +- .../MC/Disassembler/X86/amx-transpose-att.s | 57 ++++ .../MC/Disassembler/X86/amx-transpose-att.txt | 58 ++++ .../MC/Disassembler/X86/amx-transpose-intel.s | 57 ++++ llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 6 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 4 + 57 files changed, 2751 insertions(+), 120 deletions(-) create mode 100644 clang/lib/Headers/amxtransposeintrin.h create mode 100644 clang/test/CodeGen/X86/amx_transpose.c create mode 100644 clang/test/CodeGen/X86/amx_transpose_api.c create mode 100644 clang/test/CodeGen/X86/amx_transpose_errors.c create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir create mode 100644 
llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_copy.mir create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir create mode 100644 llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-att.s create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-att.txt create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-intel.s diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ce046a305c89b..dc58f98af55cc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -623,6 +623,8 @@ X86 Support - All intrinsics in tbmintrin.h can now be used in constant expressions. +- Support ISA of ``AMX-TRANSPOSE``. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 2c591edb2835c..4e95a8a73d550 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2ddb2f5312148..c55f2b86f4cb1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6287,6 +6287,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>; def mno_amx_int8 :
Flag<["-"], "mno-amx-int8">, Group; def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; +def mamx_transpose : Flag<["-"], "mamx-transpose">, Group; +def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group; def msse : Flag<["-"], "msse">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 5448bd841959f..fe5b600e6777f 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXTILE = true; } else if (Feature == "+amx-complex") { HasAMXCOMPLEX = true; + } else if (Feature == "+amx-transpose") { + HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP16__"); if (HasAMXCOMPLEX) Builder.defineMacro("__AMX_COMPLEX__"); + if (HasAMXTRANSPOSE) + Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1065,6 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-int8", true) .Case("amx-tile", true) + .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1-256", true) .Case("avx10.1-512", true) @@ -1182,6 +1187,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) .Case("avx10.1-256", HasAVX10_1) .Case("avx10.1-512", HasAVX10_1_512) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index a99ae62984c7d..3e1fb41082950 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXINT8 = false; bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; + bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3f28b7f26c36f..67d28ccec0f37 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } + // Corresponding to intrisics which will return 2 tiles (tile0_tile1). 
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + Intrinsic::ID IID; + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + IID = Intrinsic::x86_t2rpntlvwz0_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + IID = Intrinsic::x86_t2rpntlvwz0t1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + IID = Intrinsic::x86_t2rpntlvwz1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + IID = Intrinsic::x86_t2rpntlvwz1t1_internal; + break; + } + + // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) + Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), + {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); + + auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); + assert(PtrTy && "arg3 must be of pointer type"); + QualType PtreeTy = PtrTy->getPointeeType(); + llvm::Type *TyPtee = ConvertType(PtreeTy); + + // Bitcast amx type (x86_amx) to vector type (256 x i32) + // Then store tile0 into DstPtr0 + Value *T0 = Builder.CreateExtractValue(Call, 0); + Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T0}); + Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); + + // Then store tile1 into DstPtr1 + Value *T1 = Builder.CreateExtractValue(Call, 1); + Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T1}); + Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); + + // Note: We deliberately avoid using x86_tilestored64_internal to store + // the results here, because it cannot guarantee the scope of the memory + // write. That may cause shape reloads after the first AMX intrinsic, + // which the current AMX register allocation has no ability to handle. + + return Store; + } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index ff392e7122a44..708525198324b 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -149,6 +149,7 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index baa56f5b28e8e..f07a568901185 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +typedef int _tile1024i_1024a + __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
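For reference, a minimal usage sketch of the two-destination internal intrinsic this lowering supports — not part of the patch itself; the row/col/stride values are hypothetical, the source buffer contents are assumed to be in VNNI layout, and the tile configuration is assumed to have been done elsewhere (compile with -mamx-transpose):

#include <immintrin.h>

static char buf[2048]; /* source data; VNNI layout assumed */

void pair_load_sketch(void) {
  _tile1024i t0, t1;
  /* row = 16 rows; col0/col1 = 64 bytes per row; stride = 64: illustrative */
  _tile_2rpntlvwz0_internal(16, 64, 64, &t0, &t1, buf, 64);
  /* t0 now holds tile0 of the loaded pair, t1 holds tile1 */
}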
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h new file mode 100644 index 0000000000000..d5dc68f415284 --- /dev/null +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -0,0 +1,248 @@ +/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxtransposeintrin.h> directly; include <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_TRANSPOSEINTRIN_H +#define __AMX_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) + +#define _tile_2rpntlvwz0(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0(tdst, base, stride) +#define _tile_2rpntlvwz0t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) +#define _tile_2rpntlvwz1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1(tdst, base, stride) +#define _tile_2rpntlvwz1t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) + +/// Transpose 32-bit elements from \a src and write the result to \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// void __tile_transposed(__tile dst, __tile src); +/// \endcode +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The 1st source tile. Max size is 1024 Bytes. +/// +/// \code{.operation} +/// +/// FOR i := 0 TO (dst.rows-1) +/// tmp[511:0] := 0 +/// FOR j := 0 TO (dst.colsb/4-1) +/// tmp.dword[j] := src.row[j].dword[i] +/// ENDFOR +/// dst.row[i] := tmp +/// ENDFOR +/// +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode #define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const
void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +// This is an internal intrinsic. C/C++ users should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE +_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_ttransposed_internal(m, n, src); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst.
The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Transpose 32-bit elements from src and write the result to dst. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The 1st source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_transposed(__tile1024i *dst, __tile1024i src) { + dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); +} + +#endif /* __x86_64__ */ +#endif /* __AMX_TRANSPOSEINTRIN_H */
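Composed, the __tile1024i wrappers above read as in the following minimal sketch — assuming the caller has set consistent row/col shape fields and configured the tiles; buf and STRIDE are illustrative:

#include <immintrin.h>

static char buf[2048];
#define STRIDE 64

/* dst0/dst1 receive the loaded tile pair; out receives transpose(dst0). */
void wrapper_sketch(__tile1024i *dst0, __tile1024i *dst1, __tile1024i *out) {
  __tile_2rpntlvwz0(dst0, dst1, buf, STRIDE);
  __tile_transposed(out, *dst0);
}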
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 3fbabffa98df2..b6c13a6c8ec78 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -638,6 +638,10 @@ _storebe_i64(void * __P, long long __D) { #include <amxcomplexintrin.h> #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__) +#include <amxtransposeintrin.h> +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include <avx512vp2intersectintrin.h> diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 6a4d78f0ca908..9e250ac8f906c 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -631,6 +631,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tileloaddt164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: + case X86::BI__builtin_ia32_t2rpntlvwz0: + case X86::BI__builtin_ia32_t2rpntlvwz0t1: + case X86::BI__builtin_ia32_t2rpntlvwz1: + case X86::BI__builtin_ia32_t2rpntlvwz1t1: return CheckBuiltinTileArgumentsRange(TheCall, 0); case X86::BI__builtin_ia32_tdpbssd: case X86::BI__builtin_ia32_tdpbsud: @@ -641,6 +645,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tcmmimfp16ps: case X86::BI__builtin_ia32_tcmmrlfp16ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); + case X86::BI__builtin_ia32_ttransposed: + return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c new file mode 100644 index 0000000000000..deefc592c7ae6 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_transpose.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \ +// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +void test_tile_2rpntlvwz0(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0 + // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0(1, A, B); +} + +void test_tile_2rpntlvwz0t1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0t1 + // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0t1(1, A, B); +} + +void test_tile_2rpntlvwz1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1 + // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1(1, A, B); +} + +void test_tile_2rpntlvwz1t1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1t1 + // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1t1(1, A, B); +} + +void test_tile_transposed(void) +{ + // CHECK-LABEL: @test_tile_transposed + // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2) + _tile_transposed(1, 2); +} diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c new file mode 100644 index 0000000000000..10310c2332b7a --- /dev/null +++ b/clang/test/CodeGen/X86/amx_transpose_api.c @@ -0,0 +1,66 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none
-ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \ +// RUN: -target-feature +amx-transpose \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include <immintrin.h> + +char buf[2048]; +#define STRIDE 32 + +char buf2[2048]; + +void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz0 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz0t1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz1t1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_transposed(__tile1024i dst, __tile1024i src) { + //CHECK-LABEL: @test_tile_transposed + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_transposed(&dst, src); +} diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c new file mode 100644 index 0000000000000..80084c42a240d --- /dev/null +++
b/clang/test/CodeGen/X86/amx_transpose_errors.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \ +// RUN: -target-feature +avx512f -target-feature +amx-element-evex -verify + +#include +#include +#include +#include + +// Transpose +void test_tile_2rpntlvwz0(const void *A, size_t B) { + _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz0t1(const void *A, size_t B) { + _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1(const void *A, size_t B) { + _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1t1(const void *A, size_t B) { + _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_transposed() +{ + _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index ddfbb29a48f8d..d203a81e6d1e9 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -304,6 +304,13 @@ // AMX-COMPLEX: "-target-feature" "+amx-complex" // NO-AMX-COMPLEX: "-target-feature" "-amx-complex" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-transpose %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-transpose %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s +// AMX-TRANSPOSE: "-target-feature" "+amx-transpose" +// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 8b4e6bdc09226..7d16f3fa240bc 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -546,6 +546,18 @@ // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TRANSPOSE %s + +// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1 + +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s + +// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h index d00fe5c5535f5..0e0a883b0c595 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ 
b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -34,9 +34,31 @@ class ShapeT { if (MRI) deduceImm(MRI); } + // When ShapeT has multiple shapes, we only use Shapes (never Row and Col) + // and ImmShapes. Since the common case is a single shape (which simply + // uses Shape.Row or Shape.Col), we don't merge Row and Col into the + // Shapes vector, to keep the speed and code simplicity. + // TODO: The above solution is a temporary way to minimize changes to the + // current tile register allocation code. It cannot handle mixed Reg and + // Imm shapes across different shapes (e.g. shape 1 is a reg shape while + // shape 2 is an imm shape). Refine this when we have more multi-tile + // shape instructions! + ShapeT(ArrayRef<MachineOperand *> ShapesOperands, + const MachineRegisterInfo *MRI = nullptr) + : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), + ColImm(InvalidImmShape) { + assert(ShapesOperands.size() % 2 == 0 && "Missing row or col!"); + + for (auto *Shape : ShapesOperands) + Shapes.push_back(Shape); + + if (MRI) + deduceImm(MRI); + } ShapeT() : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), ColImm(InvalidImmShape) {} + // TODO: We need to extend the comparison operators for multi-shapes if + // the need arises in the future. bool operator==(const ShapeT &Shape) const { MachineOperand *R = Shape.Row; MachineOperand *C = Shape.Col; @@ -53,13 +75,40 @@ class ShapeT { bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); } - MachineOperand *getRow() const { return Row; } + MachineOperand *getRow(unsigned I = 0) const { + if (Shapes.empty()) + return Row; + assert(Shapes.size() / 2 >= I && "Get invalid row from id!"); + return Shapes[I * 2]; + } - MachineOperand *getCol() const { return Col; } + MachineOperand *getCol(unsigned I = 0) const { + if (Shapes.empty()) + return Col; + assert(Shapes.size() / 2 >= I && "Get invalid col from id!"); + return Shapes[I * 2 + 1]; + } - int64_t getRowImm() const { return RowImm; } + int64_t getRowImm(unsigned I = 0) const { + if (ImmShapes.empty()) + return RowImm; + assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!"); + return ImmShapes[I * 2]; + } - int64_t getColImm() const { return ColImm; } + int64_t getColImm(unsigned I = 0) const { + if (ImmShapes.empty()) + return ColImm; + assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!"); + return ImmShapes[I * 2 + 1]; + } + + unsigned getShapeNum() { + if (Shapes.empty()) + return isValid() ? 1 : 0; + else + return Shapes.size() / 2; + } bool isValid() { return (Row != nullptr) && (Col != nullptr); } @@ -72,14 +121,35 @@ class ShapeT { for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { const auto *MI = DefMO.getParent(); if (MI->isMoveImmediate()) { - Imm = MI->getOperand(1).getImm(); + assert(MI->getNumOperands() == 2 && + "Unsupported number of operands in instruction for setting " + "row/column."); + if (MI->getOperand(1).isImm()) { + Imm = MI->getOperand(1).getImm(); + } else { + assert(MI->getOperand(1).isImplicit() && + "Operand 1 is assumed to be implicit."); + Imm = 0; + } break; } } return Imm; }; - RowImm = GetImm(Row->getReg()); - ColImm = GetImm(Col->getReg()); + if (Shapes.empty()) { // Single Shape + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); + // The number of rows of the 2nd destination buffer is the same as that + // of the 1st destination buffer. If the column size is zero, the row + // size should be reset to zero too. + if (ColImm == 0) + Row = Col; + } else { // Multiple Shapes + for (auto *Shape : Shapes) { + int64_t ImmShape = GetImm(Shape->getReg()); + ImmShapes.push_back(ImmShape); + } + } } private: @@ -88,6 +158,9 @@ class ShapeT { MachineOperand *Col; int64_t RowImm = -1; int64_t ColImm = -1; + // Multiple Shapes + SmallVector<MachineOperand *> Shapes; + SmallVector<int64_t> ImmShapes; }; } // namespace llvm
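A short consumer-side sketch of the multi-shape accessors above — the walkShapes helper is hypothetical, and the iteration simply mirrors the Shapes[I * 2] / Shapes[I * 2 + 1] layout:

#include "llvm/CodeGen/TileShapeInfo.h"

using namespace llvm;

// For a tile pair, getShapeNum() reports 2; a single-shape ShapeT reports 1.
static void walkShapes(ShapeT &Shape) {
  for (unsigned I = 0; I < Shape.getShapeNum(); ++I) {
    MachineOperand *Row = Shape.getRow(I); // Shapes[I * 2]
    MachineOperand *Col = Shape.getCol(I); // Shapes[I * 2 + 1]
    (void)Row;
    (void)Col;
  }
}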
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 5262e3154ff72..0c27ac5d7cec7 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5917,6 +5917,23 @@ let TargetPrefix = "x86" in { [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; + // AMX-TRANSPOSE def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5994,6 +6011,26 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + def int_x86_t2rpntlvwz0_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_t2rpntlvwz0t1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_t2rpntlvwz1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_t2rpntlvwz1t1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_ttransposed_internal : + ClangBuiltin<"__builtin_ia32_ttransposed_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index 5ec8a718d5a3e..1e07fbe64f7d3 100644 --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -511,6 +511,7 @@ enum OperandEncoding { ENCODINGS ENCODING_max }; ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_TMM, "tile") \ + ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index e5bf196559ba6..53de78e9789fb 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@
-262,6 +262,7 @@ X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 37) X86_FEATURE_COMPAT(AVX10_2, "avx10.2-256", 0) X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) X86_FEATURE (ZU, "zu") +X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 03c333b90108e..07a00af881afe 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } + bool isTILEPair() const { + return Kind == Register && + X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); + } + + void addTILEPairOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Reg = getReg(); + switch (Reg) { + default: + llvm_unreachable("Invalid tile register!"); + case X86::TMM0: + case X86::TMM1: + Reg = X86::TMM0_TMM1; + break; + case X86::TMM2: + case X86::TMM3: + Reg = X86::TMM2_TMM3; + break; + case X86::TMM4: + case X86::TMM5: + Reg = X86::TMM4_TMM5; + break; + case X86::TMM6: + case X86::TMM7: + Reg = X86::TMM6_TMM7; + break; + } + Inst.addOperand(MCOperand::createReg(Reg)); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index ee1c8144f681e..f198234f1ca30 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -806,6 +806,10 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ + case TYPE_TMM_PAIR: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2315,6 +2319,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: + case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index b0aa70be12d83..dc9af2caa77b1 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,6 +535,12 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) +#define REGS_TMM_PAIRS \ + ENTRY(TMM0_TMM1) \ + ENTRY(TMM2_TMM3) \ + ENTRY(TMM4_TMM5) \ + ENTRY(TMM6_TMM7) + #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -559,6 +565,7 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ + REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index e7ba13215feb5..51b82321d679b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -463,3 +463,22 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask 
pair register name"); } + +void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + switch (MI->getOperand(OpNo).getReg()) { + case X86::TMM0_TMM1: + printRegName(OS, X86::TMM0); + return; + case X86::TMM2_TMM3: + printRegName(OS, X86::TMM2); + return; + case X86::TMM4_TMM5: + printRegName(OS, X86::TMM4); + return; + case X86::TMM6_TMM7: + printRegName(OS, X86::TMM6); + return; + } + llvm_unreachable("Unknown mask pair register name"); +} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 221102e17c653..2a7b750bd6752 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -38,6 +38,7 @@ class X86InstPrinterCommon : public MCInstPrinter { const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index d57450d91ea2d..9a2ff76090f7d 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -270,6 +270,9 @@ def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true", def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", "Support AMX-COMPLEX instructions", [FeatureAMXTILE]>; +def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", + "Support AMX amx-transpose instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f4c67f115c9f3..5584c08a98303 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -568,6 +568,131 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } + // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding + // AMX instruction to support it. So, split it to 2 load instructions: + // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> + // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + + // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" + case X86::PTILEPAIRLOAD: { + int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); + Register TReg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); + Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); + unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; + + MachineInstrBuilder MIBLo = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) + .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); + MachineInstrBuilder MIBHi = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) + .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(1 + i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + TmmSize); + else + MIBHi.add(MBBI->getOperand(1 + i)); + } + + // Make sure the first stride reg used in first tileload is alive. 
+ MachineOperand &Stride = + MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); + Stride.setIsKill(false); + + // Split the memory operand, adjusting the offset and size for the halves. + MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); + MachineMemOperand *MMOHi = + MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo. + MBB.erase(MBBI); + return true; + } + // Similar to TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, with no + // corresponding AMX instruction to support it. So, split it too: + // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> + // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + + // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" + case X86::PTILEPAIRSTORE: { + int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); + Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); + bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); + Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); + Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); + unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; + + MachineInstrBuilder MIBLo = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); + MachineInstrBuilder MIBHi = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + TmmSize); + else + MIBHi.add(MBBI->getOperand(i)); + } + MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); + MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); + + // Make sure the first stride reg used in first tilestore is alive. + MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); + Stride.setIsKill(false); + + // Split the memory operand, adjusting the offset and size for the halves. + MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); + MachineMemOperand *MMOHi = + MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo.
+ MBB.erase(MBBI); + return true; + } + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: { + for (unsigned i = 3; i > 0; --i) + MI.removeOperand(i); + unsigned Opc; + switch (Opcode) { + case X86::PT2RPNTLVWZ0V: + Opc = X86::T2RPNTLVWZ0; + break; + case X86::PT2RPNTLVWZ0T1V: + Opc = X86::T2RPNTLVWZ0T1; + break; + case X86::PT2RPNTLVWZ1V: + Opc = X86::T2RPNTLVWZ1; + break; + case X86::PT2RPNTLVWZ1T1V: + Opc = X86::T2RPNTLVWZ1T1; + break; + default: + llvm_unreachable("Impossible Opcode!"); + } + MI.setDesc(TII->get(Opc)); + return true; + } + case X86::PTTRANSPOSEDV: { + for (int i = 2; i > 0; --i) + MI.removeOperand(i); + MI.setDesc(TII->get(X86::TTRANSPOSED)); + return true; + } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index d50a4d3b23ae2..fd0987db5e4f5 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -268,24 +268,36 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } +unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual()) { + unsigned RegClassID = MRI->getRegClass(Reg)->getID(); + if (RegClassID == X86::TILERegClassID) + return 1; + if (RegClassID == X86::TILEPAIRRegClassID) + return 2; + } else { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; + } + return 0; +} + +static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { + return getTileDefNum(MRI, VirtReg) > 0; +} + static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // The instruction must have 3 operands: tile def, row, col. if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) return false; MachineOperand &MO = MI.getOperand(0); - if (MO.isReg()) { - Register Reg = MO.getReg(); - // FIXME it may be used after Greedy RA and the physical - // register is not rewritten yet. 
- if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return true; - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; - } + if (!MO.isReg()) + return false; - return false; + return getTileDefNum(MRI, MO.getReg()) > 0; } static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -424,8 +436,7 @@ void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB, static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) { MachineOperand &MO = MI.getOperand(0); - if (MO.isReg() && MO.getReg().isVirtual() && - MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + if (MO.isReg() && MO.getReg().isVirtual() && isTileRegister(MRI, MO.getReg())) return true; return false; } @@ -524,8 +535,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + if (Reg.isVirtual() && isTileRegister(MRI, Reg)) return true; } return false; @@ -617,6 +627,19 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } + unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); + if (TileDefNum > 1) { + for (unsigned I = 1; I < TileDefNum; I++) { + MachineOperand *ColxMO = &MI.getOperand(2 + I); + MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg()); + if (ColxMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = ColxMI; + else if (dominates(MBB, LastShapeMI, ColxMI)) + LastShapeMI = ColxMI; + } + } + } // If there is user live out of the tilecfg, spill it and reload in // before the user. Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 70bc11228be6a..72264dd6a5c38 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { +static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return false; + return 0; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { Register Reg = MO.getReg(); - // FIXME it may be used after Greedy RA and the physical + // FIXME: It may be used after Greedy RA and the physical // register is not rewritten yet. 
- if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return true; + if (Reg.isVirtual()) { + if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return 1; + if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) + return 2; + } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; } - return false; + return 0; +} + +static unsigned getTMMIndex(Register Reg) { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return Reg - X86::TMM0; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return (Reg - X86::TMM0_TMM1) * 2; + llvm_unreachable("Invalid Tmm Reg!"); } // PreTileConfig should configure the tile registers based on basic @@ -110,14 +123,17 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) + unsigned DefNum = getNumDefTiles(MRI, MI); + if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); - MachineOperand &Col = MI.getOperand(2); - unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; - ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); + for (unsigned I = 0; I < DefNum; I++) { + MachineOperand &Col = MI.getOperand(2 + I); + ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); + } } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 70e4c199190d6..aea86c280e2f9 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -323,6 +323,35 @@ namespace { Segment = CurDAG->getRegister(0, MVT::i16); } + // Utility function to determine whether it is an AMX SDNode right after + // lowering but before ISEL. + bool isAMXSDNode(SDNode *N) const { + // Check if N is AMX SDNode: + // 1. check specific opcode since these carry MVT::Untyped instead of + // x86amx_type; + // 2. check result type; + // 3. check operand type; + switch (N->getOpcode()) { + default: + break; + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: + return true; + } + for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { + if (N->getValueType(Idx) == MVT::x86amx) + return true; + } + for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) { + SDValue Op = N->getOperand(Idx); + if (Op.getValueType() == MVT::x86amx) + return true; + } + return false; + } + // Utility function to determine whether we should avoid selecting + // immediate forms of instructions for better code size or not.
// At a high level, we'd like to avoid such instructions when @@ -5278,6 +5307,47 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_t2rpntlvwz0: + case Intrinsic::x86_t2rpntlvwz0t1: + case Intrinsic::x86_t2rpntlvwz1: + case Intrinsic::x86_t2rpntlvwz1t1: { + if (!Subtarget->hasAMXTRANSPOSE()) + break; + auto *MFI = + CurDAG->getMachineFunction().getInfo(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0: + Opc = X86::PT2RPNTLVWZ0; + break; + case Intrinsic::x86_t2rpntlvwz0t1: + Opc = X86::PT2RPNTLVWZ0T1; + break; + case Intrinsic::x86_t2rpntlvwz1: + Opc = X86::PT2RPNTLVWZ1; + break; + case Intrinsic::x86_t2rpntlvwz1t1: + Opc = X86::PT2RPNTLVWZ1T1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceNode(Node, CNode); + return; + } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7f4dc12a20837..676ae6b876103 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27281,6 +27281,53 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_t2rpntlvwz0_internal: + case Intrinsic::x86_t2rpntlvwz0t1_internal: + case Intrinsic::x86_t2rpntlvwz1_internal: + case Intrinsic::x86_t2rpntlvwz1t1_internal: { + if (!Subtarget.hasAMXTILE()) + break; + auto *X86MFI = DAG.getMachineFunction().getInfo(); + X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); + unsigned IntNo = Op.getConstantOperandVal(1); + unsigned Opc = 0; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0_internal: + Opc = X86::PT2RPNTLVWZ0V; + break; + case Intrinsic::x86_t2rpntlvwz0t1_internal: + Opc = X86::PT2RPNTLVWZ0T1V; + break; + case Intrinsic::x86_t2rpntlvwz1_internal: + Opc = X86::PT2RPNTLVWZ1V; + break; + case Intrinsic::x86_t2rpntlvwz1t1_internal: + Opc = X86::PT2RPNTLVWZ1T1V; + break; + } + + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); + + SDValue Ops[] = {Op.getOperand(2), // Row + Op.getOperand(3), // Col0 + Op.getOperand(4), // Col1 + Op.getOperand(5), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + Op.getOperand(6), // Index + DAG.getTargetConstant(0, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment + Op.getOperand(0)}; // Chain + + MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); + SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, + SDValue(Res, 0)); + SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, + SDValue(Res, 0)); + return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); + } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -37029,6 +37076,10 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; + auto TMMImmToTMMPair = [](unsigned Imm) { + assert(Imm < 8 && "Illegal tmm pair index."); + return X86::TMM0_TMM1 + Imm / 2; + }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: @@ -37503,6 +37554,49 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0: + case X86::PT2RPNTLVWZ0T1: + case X86::PT2RPNTLVWZ1: + case X86::PT2RPNTLVWZ1T1: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PT2RPNTLVWZ0: + Opc = X86::T2RPNTLVWZ0; + break; + case X86::PT2RPNTLVWZ0T1: + Opc = X86::T2RPNTLVWZ0T1; + break; + case X86::PT2RPNTLVWZ1: + Opc = X86::T2RPNTLVWZ1; + break; + case X86::PT2RPNTLVWZ1T1: + Opc = X86::T2RPNTLVWZ1T1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); + + MIB.add(MI.getOperand(1)); // base + MIB.add(MI.getOperand(2)); // scale + MIB.add(MI.getOperand(3)); // index + MIB.add(MI.getOperand(4)); // displacement + MIB.add(MI.getOperand(5)); // segment + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::PTTRANSPOSED: { + const DebugLoc &DL = MI.getDebugLoc(); + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::TTRANSPOSED)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. 
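
Hedged usage sketch (not part of the patch): driving these pseudos from C/C++ via the intrinsics this patch adds in amxtransposeintrin.h. The macro spellings _tile_2rpntlvwz0 and _tile_transposed are assumptions taken from that header; _tile_release comes from the existing AMX-TILE header. Compile with -mamx-tile -mamx-transpose.

#include <immintrin.h>

void pair_load_then_transpose(const void *base) {
  _tile_2rpntlvwz0(4, base, 64); // fills the tmm4/tmm5 pair (imm 4 -> TMM4_TMM5 via Imm / 2)
  _tile_transposed(0, 4);        // tmm0 = transpose of tmm4 (TTRANSPOSED)
  _tile_release();               // give up tile state when done
}
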
+ return BB; + } } } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 99deacc811a17..1c437e8f03ffb 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -267,3 +267,66 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { } } // SchedRW = [WriteSystem] } + +let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { + let mayStore = 1 in + def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; + let mayLoad = 1 in + def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; +} + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { + def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", + []>, VEX, WIG, T8,PS; + + def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PS; + + def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PD; + + def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PD; + + def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + let isPseudo = true in { + def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + } + + def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src), + [(set TILE: $dst, + (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, + TILE:$src))]>; + + let usesCustomInserter = 1 in { + def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), + [(int_x86_ttransposed timm:$dst, timm:$src)]>; + } + } +} // HasAMXTILE, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 38ea1f35be2b9..9b002ebd3a93b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4538,6 +4538,11 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED + case 2048: + assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && + "Unknown 2048-byte regclass"); + assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); + return Load ? 
X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
   }
 }
 
@@ -4732,6 +4737,7 @@ static bool isAMXOpcode(unsigned Opc) {
   case X86::TILESTORED:
   case X86::TILELOADD_EVEX:
   case X86::TILESTORED_EVEX:
+  case X86::PTILEPAIRLOAD:
     return true;
   }
 }
@@ -4744,7 +4750,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
   default:
     llvm_unreachable("Unexpected special opcode!");
   case X86::TILESTORED:
-  case X86::TILESTORED_EVEX: {
+  case X86::TILESTORED_EVEX:
+  case X86::PTILEPAIRSTORE: {
     // tilestored %tmm, (%sp, %idx)
     MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
     Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@@ -4758,7 +4765,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
     break;
   }
   case X86::TILELOADD:
-  case X86::TILELOADD_EVEX: {
+  case X86::TILELOADD_EVEX:
+  case X86::PTILEPAIRLOAD: {
     // tileloadd (%sp, %idx), %tmm
     MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
     Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td
index f8f5cd83166e3..2102cb4b6b5b7 100644
--- a/llvm/lib/Target/X86/X86InstrOperands.td
+++ b/llvm/lib/Target/X86/X86InstrOperands.td
@@ -501,3 +501,10 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
 def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
   let ParserMatchClass = VK16PairAsmOperand;
 }
+
+let RenderMethod = "addTILEPairOperands" in
+  def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; }
+
+def TILEPair : RegisterOperand<TILEPAIR> {
+  let ParserMatchClass = TILEPairAsmOperand;
+}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index a815ddc9714f0..14aff7f82710e 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -182,6 +182,7 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
 def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
 def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
 def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
+def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">;
def HasCRC32 : Predicate<"Subtarget->hasCRC32()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 919e1eb3e38e9..b9d811e91ab5a 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -74,6 +74,22 @@ static bool isAMXCast(Instruction *II) {
         match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
 }
 
+// Some instructions may return more than one tile.
+// e.g.: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
+static unsigned getNumDefTiles(IntrinsicInst *II) {
+  Type *Ty = II->getType();
+  if (Ty->isX86_AMXTy())
+    return 1;
+
+  unsigned Num = 0;
+  for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) {
+    Type *STy = Ty->getContainedType(i);
+    if (STy->isX86_AMXTy())
+      Num++;
+  }
+  return Num;
+}
+
 static bool isAMXIntrinsic(Value *I) {
   auto *II = dyn_cast<IntrinsicInst>(I);
   if (!II)
     return false;
   // Check if return type or parameter is x86_amx. If it is x86_amx
   // the intrinsic must be x86 amx intrinsics.
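
Aside: a minimal standalone model (plain C++, no LLVM types) of the getNumDefTiles() logic above, counting x86_amx members in an aggregate return.

#include <cstdio>
#include <vector>

// Stand-in for llvm::Type: IsAMX marks x86_amx, Members models the
// contained types of an aggregate (struct) return.
struct Ty {
  bool IsAMX = false;
  std::vector<Ty> Members;
};

static unsigned numDefTiles(const Ty &T) {
  if (T.IsAMX)
    return 1; // plain x86_amx result
  unsigned Num = 0;
  for (const Ty &M : T.Members) // e.g. { x86_amx, x86_amx }
    if (M.IsAMX)
      ++Num;
  return Num;
}

int main() {
  Ty PairResult{false, {Ty{true, {}}, Ty{true, {}}}};
  std::printf("two-tile result defines %u tiles\n", numDefTiles(PairResult));
}
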
-  if (II->getType()->isX86_AMXTy())
+  if (getNumDefTiles(II) > 0)
     return true;
   for (Value *V : II->args()) {
     if (V->getType()->isX86_AMXTy())
@@ -121,12 +137,96 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
   llvm_unreachable("No terminator in the entry block!");
 }
 
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+class ShapeCalculator {
+private:
+  TargetMachine *TM = nullptr;
+
+  // In AMX intrinsics we let Shape = {Row, Col}, but the
+  // RealCol = Col / ElementSize. We may use the RealCol
+  // as a new Row for other newly created AMX intrinsics.
+  std::map<Value *, Value *> Col2Row, Row2Col;
+
+public:
+  ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
+  std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
+  std::pair<Value *, Value *> getShape(PHINode *Phi);
+  Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
+  Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
+};
+
+Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
+                                      unsigned Granularity) {
+  if (Col2Row.count(V))
+    return Col2Row[V];
+  IRBuilder<> Builder(II);
+  Value *RealRow = nullptr;
+  if (isa<ConstantInt>(V))
+    RealRow =
+        Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) / Granularity);
+  else if (isa<Instruction>(V)) {
+    // When it is not a const value and it is not a function argument, we
+    // create Row after the definition of V instead of
+    // before II. For example, II is %118, and we try to get the shape of %117:
+    //   %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
+    //   i32> %115).
+    //   %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
+    //   %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
+    //   %117).
+    // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its
+    // definition is after its user(new tileload for %117).
+    // So, the best choice is to create %row right after the definition of
+    // %106.
+    Builder.SetInsertPoint(cast<Instruction>(V));
+    RealRow = Builder.CreateUDiv(V, Builder.getInt16(4));
+    cast<Instruction>(RealRow)->moveAfter(cast<Instruction>(V));
+  } else {
+    // When it is not a const value and it is a function argument, we create
+    // Row at the entry bb.
+    IRBuilder<> NewBuilder(
+        getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+    RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
+  }
+  Col2Row[V] = RealRow;
+  return RealRow;
+}
+
+Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
+                                      unsigned Granularity) {
+  if (Row2Col.count(V))
+    return Row2Col[V];
+  IRBuilder<> Builder(II);
+  Value *RealCol = nullptr;
+  if (isa<ConstantInt>(V))
+    RealCol =
+        Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
+  else if (isa<Instruction>(V)) {
+    Builder.SetInsertPoint(cast<Instruction>(V));
+    RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
+    cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
+  } else {
+    // When it is not a const value and it is a function argument, we create
+    // Col at the entry bb.
+    IRBuilder<> NewBuilder(
+        getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+    RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity));
+  }
+  Row2Col[V] = RealCol;
+  return RealCol;
+}
+
+// TODO: Refine the row and col-in-bytes of tile to row and col of matrix.
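
Aside: the caching behaviour of ShapeCalculator, modelled with plain ints instead of Value pointers. This is a sketch of the memoization only; each conversion is computed once per source value, then served from the Col2Row/Row2Col caches.

#include <cstdio>
#include <map>

class ShapeConverter {
  std::map<int, int> Col2Row, Row2Col; // stand-ins for Value* -> Value* maps

public:
  int rowFromCol(int ColBytes, int Granularity) {
    auto It = Col2Row.find(ColBytes);
    if (It != Col2Row.end())
      return It->second; // cache hit: reuse the previously created value
    return Col2Row[ColBytes] = ColBytes / Granularity;
  }
  int colFromRow(int Rows, int Granularity) {
    auto It = Row2Col.find(Rows);
    if (It != Row2Col.end())
      return It->second;
    return Row2Col[Rows] = Rows * Granularity;
  }
};

int main() {
  ShapeConverter SC;
  std::printf("%d %d\n", SC.rowFromCol(64, 4), SC.colFromRow(16, 4)); // 16 64
}
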
+std::pair ShapeCalculator::getShape(IntrinsicInst *II, + unsigned OpNo) { + (void)TM; IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); + case Intrinsic::x86_t2rpntlvwz0_internal: + case Intrinsic::x86_t2rpntlvwz0t1_internal: + case Intrinsic::x86_t2rpntlvwz1_internal: + case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: { @@ -154,43 +254,24 @@ static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { Col = II->getArgOperand(2); break; case 5: - if (isa(II->getArgOperand(2))) - Row = Builder.getInt16( - (cast(II->getOperand(2))->getSExtValue()) / 4); - else if (isa(II->getArgOperand(2))) { - // When it is not a const value and it is not a function argument, we - // create Row after the definition of II->getOperand(2) instead of - // before II. For example, II is %118, we try to getshape for %117: - // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x - // i32> %115). - // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 - // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx - // %117). - // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its - // definition is after its user(new tileload for %117). - // So, the best choice is to create %row right after the definition of - // %106. - Builder.SetInsertPoint(cast(II->getOperand(2))); - Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4)); - cast(Row)->moveAfter(cast(II->getOperand(2))); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. - IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4)); - } + Row = getRowFromCol(II, II->getArgOperand(2), 4); Col = II->getArgOperand(1); break; } break; } + case Intrinsic::x86_ttransposed_internal: { + assert((OpNo == 2) && "Illegal Operand Number."); + Row = getRowFromCol(II, II->getArgOperand(1), 4); + Col = getColFromRow(II, II->getArgOperand(0), 4); + break; + } } return std::make_pair(Row, Col); } -static std::pair getShape(PHINode *Phi) { +std::pair ShapeCalculator::getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = U.getOperandNo(); User *V = U.getUser(); @@ -223,14 +304,15 @@ static std::pair getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; + ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map Col2Row; + std::map Col2Row, Row2Col; public: - X86LowerAMXType(Function &F) : Func(F) {} + X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -247,7 +329,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast(U.getUser()); - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. 
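
To make the transpose shape rule above concrete, a worked standalone example with illustrative values: for x86_ttransposed_internal with 4-byte elements, the source tile's rows come from the result's column bytes and its column bytes come from the result's rows.

#include <cstdio>

struct Shape { unsigned Rows, ColBytes; };

// Mirrors getShape() for x86_ttransposed_internal at 4-byte granularity:
// Row = getRowFromCol(result col), Col = getColFromRow(result rows).
static Shape sourceShapeOfTranspose(Shape Result) {
  return {Result.ColBytes / 4, Result.Rows * 4};
}

int main() {
  Shape Result{16, 32}; // result tile: 16 rows x 32 bytes (8 fp32 columns)
  Shape Src = sourceShapeOfTranspose(Result);
  std::printf("source tile: %u rows x %u bytes\n", Src.Rows, Src.ColBytes); // 8 x 64
}
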
Value *Stride = Builder.getInt64(64); @@ -327,7 +409,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); @@ -467,10 +549,18 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = cast(TileDef); + auto *II = dyn_cast(TileDef); + unsigned Idx = 0; + // Extract tile from mult tiles' def. + if (auto *Extr = dyn_cast(TileDef)) { + assert(Extr->hasIndices() && "Tile extract miss index!"); + Idx = Extr->getIndices()[0]; + II = cast(Extr->getOperand(0)); + } + assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); + Value *Row = II->getOperand(Idx); + Value *Col = II->getOperand(Idx + 1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -489,14 +579,20 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; + unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast(V)->getIncomingValue(0); II = cast(PhiOp); + } else if (auto *Extr = dyn_cast(V)) { + // Extract tile from mult tiles' def. + assert(Extr->hasIndices() && "Tile extract miss index!"); + Idx = Extr->getIndices()[0]; + II = cast(Extr->getOperand(0)); } else { II = cast(V); } - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); + Value *Row = II->getOperand(Idx); + Value *Col = II->getOperand(Idx + 1); Instruction *UserI = cast(U.getUser()); IRBuilder<> Builder(UserI); @@ -707,10 +803,12 @@ namespace { class X86LowerAMXCast { Function &Func; + ShapeCalculator *SC; std::unique_ptr DT; public: - X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} + X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) + : Func(F), SC(ShapeC), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineLdSt(SmallVectorImpl &Casts); @@ -788,7 +886,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(OldPN); + std::tie(Row, Col) = SC->getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
     if (!Row || !Col || !isa<ConstantInt>(Row) || !isa<ConstantInt>(Col))
@@ -919,6 +1017,19 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
   return true;
 }
 
+static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
+                                       bool IsRow) {
+  if (!isAMXIntrinsic(Inst))
+    return nullptr;
+
+  auto *II = cast<IntrinsicInst>(Inst);
+  if (IsRow)
+    return II->getOperand(0);
+
+  assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
+  return II->getOperand(ShapeIdx + 1);
+}
+
 // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
 // store <256 x i32> %43, <256 x i32>* %p, align 64
 // -->
 // call void @llvm.x86.tilestored64.internal(%row, %col, %p,
 //                                           i64 64, x86_amx %42)
 bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
   Value *Tile = Cast->getOperand(0);
-  // TODO: If it is cast intrinsic or phi node, we can propagate the
-  // shape information through def-use chain.
-  if (!isAMXIntrinsic(Tile))
+
+  assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
+
+  // TODO: Specially handle the multi-use case.
+  if (Tile->getNumUses() != 1)
     return false;
-  auto *II = cast<IntrinsicInst>(Tile);
-  // Tile is output from AMX intrinsic. The first operand of the
-  // intrinsic is row, the second operand of the intrinsic is column.
-  Value *Row = II->getOperand(0);
-  Value *Col = II->getOperand(1);
+
+  // We don't fetch the shape from a tilestore, we only get it from a tile
+  // def, so for special cases we can set the max tile shape on the tilestore.
   IRBuilder<> Builder(ST);
+  Value *Row = nullptr;
+  Value *Col = nullptr;
+
+  if (isAMXIntrinsic(Tile)) {
+    auto *II = cast<IntrinsicInst>(Tile);
+    // Tile is output from AMX intrinsic. The first operand of the
+    // intrinsic is row, the second operand of the intrinsic is column.
+    Row = II->getOperand(0);
+    Col = II->getOperand(1);
+  } else {
+    // Multi-tile values in a structure are supported now, so the tile may
+    // come from extracting a multi-tile structure.
+    // For example:
+    // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
+    // i16 %2, i16 %3, i8* %4, i64 %5)
+    // %7 = extractvalue { x86_amx, x86_amx } %6, 0
+    // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7)
+    // store <256 x i32> %8, <256 x i32>* %0, align 1024
+    //
+    // TODO: Currently we only handle the extractvalue case; enhance this for
+    // other cases if possible.
+    auto *II = cast<ExtractValueInst>(Tile);
+    assert(II && "Unhandled source when fetching the tile value!");
+    unsigned ShapeIdx = II->getIndices()[0];
+    Value *Tiles = II->getOperand(0);
+    Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true);
+    Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false);
+  }
+  assert(Row && Col && "Failed to get the shape!");
+
+  // Stride should be equal to col (measured in bytes).
   Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
   Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy());
@@ -959,7 +1100,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
   // shape information through def-use chain.
   if (!isAMXIntrinsic(II))
     return false;
-  std::tie(Row, Col) = getShape(II, OpNo);
+  std::tie(Row, Col) = SC->getShape(II, OpNo);
   IRBuilder<> Builder(LD);
   // Stride should be equal to col(measured by bytes)
   Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1169,7 +1310,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
   Builder.CreateStore(Src, AllocaAddr);
   // TODO we can pick an constant operand for the shape.
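
Aside: the operand convention getShapeFromAMXIntrinsic() encodes, as a standalone arithmetic sketch (values are illustrative).

#include <array>
#include <cstdio>

int main() {
  // Operand layout of a two-tile def such as t2rpntlvwz0.internal:
  // operand 0 = row (shared), operand ShapeIdx + 1 = column of tile ShapeIdx.
  std::array<unsigned, 3> Ops = {16, 64, 32}; // row, col0, col1
  for (unsigned ShapeIdx = 0; ShapeIdx < 2; ++ShapeIdx)
    std::printf("extractvalue index %u -> shape %u x %u bytes\n", ShapeIdx,
                Ops[0], Ops[ShapeIdx + 1]);
}
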
   Value *Row = nullptr, *Col = nullptr;
-  std::tie(Row, Col) = getShape(II, OpNo);
+  std::tie(Row, Col) = SC->getShape(II, OpNo);
   std::array<Value *, 4> Args = {
       Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
   Value *NewInst =
@@ -1245,13 +1386,14 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass {
     TargetLibraryInfo *TLI =
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
 
-    X86LowerAMXCast LAC(F);
+    ShapeCalculator SC(TM);
+    X86LowerAMXCast LAC(F, &SC);
     C |= LAC.combineAMXcast(TLI);
     // There might be remaining AMXcast after combineAMXcast and they should be
     // handled elegantly.
     C |= LAC.transformAllAMXCast();
 
-    X86LowerAMXType LAT(F);
+    X86LowerAMXType LAT(F, &SC);
     C |= LAT.visit();
 
     // Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 1d1885a3dcd24..d20bfdcdb7f9c 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -118,16 +118,27 @@ class X86PreTileConfig : public MachineFunctionPass {
   bool isAMXInstruction(MachineInstr &MI) {
     if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
       return false;
-    MachineOperand &MO = MI.getOperand(0);
+
+    // PTILESTOREDV is the only exception that doesn't def an AMX register.
+    if (MI.getOpcode() == X86::PTILESTOREDV)
+      return true;
+
     // We can simply check if it is AMX instruction by its def.
     // But we should exclude old API which uses physical registers.
-    if (MO.isReg() && MO.getReg().isVirtual() &&
-        MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
-      collectShapeInfo(MI);
-      return true;
-    }
-    // PTILESTOREDV is the only exception that doesn't def a AMX register.
-    return MI.getOpcode() == X86::PTILESTOREDV;
+    MachineOperand &MO = MI.getOperand(0);
+    if (!MO.isReg() || !MO.getReg().isVirtual())
+      return false;
+
+    unsigned Shapes = 0;
+    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
+      Shapes = 1;
+    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
+      Shapes = 2;
+    if (!Shapes)
+      return false;
+
+    collectShapeInfo(MI, Shapes);
+    return true;
   }
 
   /// Check if it is an edge from loop bottom to loop head.
@@ -142,7 +153,7 @@ class X86PreTileConfig : public MachineFunctionPass {
   }
 
   /// Collect the shape def information for later use.
-  void collectShapeInfo(MachineInstr &MI);
+  void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
 
   /// Try to hoist shapes definded below AMX instructions.
   bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -208,7 +219,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Pre-configure", false, false)
 
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
   auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
     MIRef MIR(MI, MBB);
     auto I = llvm::lower_bound(ShapeBBs[MBB], MIR);
@@ -216,8 +227,10 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
       ShapeBBs[MBB].insert(I, MIR);
   };
 
-  SmallVector<Register, 8> WorkList(
-      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
+  // All shapes have the same row in a multi-tile operand.
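
Aside: a standalone model of the pruning applied below, where a move-immediate, or a COPY of one (which shows up when a column is 0 in a multi-tile operand), defines a constant shape that needs no config tracking. Types and fields here are stand-ins, not LLVM's.

#include <cstdio>

struct DefMI {
  bool IsMoveImm = false;
  const DefMI *CopySrc = nullptr; // non-null if this def is a COPY
};

// A move-immediate, or a COPY whose source is a move-immediate, yields a
// compile-time-constant shape and can be skipped when collecting shape defs.
static bool isConstantShapeDef(const DefMI &MI) {
  if (MI.IsMoveImm)
    return true;
  return MI.CopySrc && MI.CopySrc->IsMoveImm;
}

int main() {
  DefMI Imm{true, nullptr};
  DefMI CopyOfImm{false, &Imm}; // e.g. a zero column materialized via COPY
  std::printf("%d %d\n", isConstantShapeDef(Imm), isConstantShapeDef(CopyOfImm));
}
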
+ SmallVector WorkList; + for (unsigned I = 1; I < Shapes + 2; ++I) + WorkList.push_back(MI.getOperand(I).getReg()); while (!WorkList.empty()) { Register R = WorkList.pop_back_val(); MachineInstr *DefMI = MRI->getVRegDef(R); @@ -225,6 +238,14 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { MachineBasicBlock *DefMBB = DefMI->getParent(); if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second) continue; + + // This happens when column = 0 in multi-tile operand. + if (DefMI->getOpcode() == X86::COPY) { + MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + if (MI && MI->isMoveImmediate()) + continue; + } + if (DefMI->isPHI()) { for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2) if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB())) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 302d50581e1e6..2daaa95b06be0 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -642,6 +642,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } + // Reserve low half pair registers in case they are used by RA aggressively. + Reserved.set(X86::TMM0_TMM1); + Reserved.set(X86::TMM2_TMM3); + assert(checkAllSuperRegsMarked(Reserved, {X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::SIH, X86::DIH, X86::BPH, X86::SPH})); @@ -662,7 +666,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const { // and try to return the minimum number of registers supported by the target. static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) && (X86::K6_K7 + 1 == X86::TMMCFG) && - (X86::TMM7 + 1 == X86::R16) && + (X86::TMM6_TMM7 + 1 == X86::R16) && (X86::R31WH + 1 == X86::NUM_TARGET_REGS), "Register number may be incorrect"); @@ -735,7 +739,8 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, } bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { - return RC->getID() == X86::TILERegClassID; + return RC->getID() == X86::TILERegClassID || + RC->getID() == X86::TILEPAIRRegClassID; } void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { @@ -1073,12 +1078,59 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: + case X86::PTTRANSPOSEDV: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); VRM->assignVirt2Shape(VirtReg, Shape); return Shape; } + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: { + MachineOperand &MO1 = MI->getOperand(1); + MachineOperand &MO2 = MI->getOperand(2); + MachineOperand &MO3 = MI->getOperand(3); + ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI); + VRM->assignVirt2Shape(VirtReg, Shape); + return Shape; + } + } +} + +static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) { + unsigned PhysShapeNum = PhysShape.getShapeNum(); + unsigned VirtShapeNum = VirtShape.getShapeNum(); + + if (PhysShapeNum < VirtShapeNum) + return false; + + if (PhysShapeNum == VirtShapeNum) { + if (PhysShapeNum == 1) + return PhysShape == VirtShape; + + for (unsigned I = 0; I < PhysShapeNum; I++) { + ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); + ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I)); + if (VShape != PShape) + return false; + } + return true; + } + + // Hint subreg of mult-tile reg to single tile 
reg. + if (VirtShapeNum == 1) { + for (unsigned I = 0; I < PhysShapeNum; I++) { + ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); + if (VirtShape == PShape) + return true; + } + } + + // Note: Currently we have no requirement for case of + // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum) + return false; } bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, @@ -1099,7 +1151,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, if (!VRM) return BaseImplRetVal; - if (ID != X86::TILERegClassID) { + if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) { if (DisableRegAllocNDDHints || !ST.hasNDD() || !TRI.isGeneralPurposeRegisterClass(&RC)) return BaseImplRetVal; @@ -1151,7 +1203,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, return; } ShapeT PhysShape = getTileShape(VReg, const_cast(VRM), MRI); - if (PhysShape == VirtShape) + if (canHintShape(PhysShape, VirtShape)) Hints.push_back(PhysReg); }; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 166024bf3b53f..19a0b37d06a2a 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -30,6 +30,8 @@ let Namespace = "X86" in { def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; + def sub_t0 : SubRegIndex<8192>; + def sub_t1 : SubRegIndex<8192, 8192>; } //===----------------------------------------------------------------------===// @@ -431,6 +433,10 @@ def TMM5: X86Reg<"tmm5", 5>; def TMM6: X86Reg<"tmm6", 6>; def TMM7: X86Reg<"tmm7", 7>; } +// TMM register pairs +def TPAIRS : RegisterTuples<[sub_t0, sub_t1], + [(add TMM0, TMM2, TMM4, TMM6), + (add TMM1, TMM3, TMM5, TMM7)]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That @@ -835,6 +841,9 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} +// Need check alignment 3rd operand size=1024*2*8 +let isAllocatable = 1 in +def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;} //===----------------------------------------------------------------------===// // Register categories. 
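
Aside: what the TPAIRS tuples and the getReservedRegs() change above amount to, as a standalone sketch. Pair I aggregates tiles 2I and 2I+1 through sub_t0/sub_t1, and the two low pairs are reserved, leaving TMM4_TMM5 and TMM6_TMM7 for the allocator.

#include <cstdio>

int main() {
  const bool Reserved[4] = {true, true, false, false}; // TMM0_TMM1, TMM2_TMM3
  for (unsigned I = 0; I < 4; ++I)
    std::printf("tmm%u_tmm%u%s\n", 2 * I, 2 * I + 1,
                Reserved[I] ? " (reserved)" : " (allocatable)");
}
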
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 2250c3912a90d..95a84c2cda536 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -76,6 +76,63 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) +unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual()) { + unsigned RegClassID = MRI->getRegClass(Reg)->getID(); + if (RegClassID == X86::TILERegClassID) + return 1; + if (RegClassID == X86::TILEPAIRRegClassID) + return 2; + } else { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; + } + return 0; +} + +static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM, + Register VirtReg, + SmallVector &Phys2Shapes) { + unsigned Num = getAMXRegNum(MRI, VirtReg); + MCRegister PhysReg = VRM.getPhys(VirtReg); + if (!PhysReg) + return; + + if (Num == 1) { + unsigned Index = PhysReg - X86::TMM0; + if (!Phys2Shapes[Index].isValid()) { + ShapeT Shape = VRM.getShape(VirtReg); + Phys2Shapes[Index] = Shape; + return; + } + } + // Split tile pair shape info to 2 single tile shape info. e.g: + // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes. + if (Num == 2) { + unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2; + unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1; + + ShapeT Shape = VRM.getShape(VirtReg); + assert(Shape.getShapeNum() == 2 && "Unexpected shape number!"); + + if (!Phys2Shapes[Index0].isValid()) { + ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI); + Phys2Shapes[Index0] = Shape0; + } + + if (!Phys2Shapes[Index1].isValid()) { + ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI); + Phys2Shapes[Index1] = Shape1; + } + } +} + +static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) { + return getAMXRegNum(MRI, Reg) > 0; +} + bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { X86MachineFunctionInfo *X86FI = MF.getInfo(); // Early exit in the common case of non-AMX code. @@ -121,29 +178,24 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { assert(ConstMI && "Cannot find an insertion point"); unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs(); - SmallVector Phys2Virt(AMXRegNum, 0); + SmallVector Phys2Shapes(AMXRegNum, ShapeT()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VirtReg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(VirtReg)) continue; - if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID) - continue; - MCRegister PhysReg = VRM.getPhys(VirtReg); - if (!PhysReg) + if (!isAMXRegClass(&MRI, VirtReg)) continue; - unsigned Index = PhysReg - X86::TMM0; - if (!Phys2Virt[Index]) - Phys2Virt[Index] = VirtReg; + collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes); } // Fill in the shape of each tile physical register. for (unsigned I = 0; I < AMXRegNum; ++I) { - if (!Phys2Virt[I]) + ShapeT Shape = Phys2Shapes[I]; + if (!Shape.isValid()) continue; DebugLoc DL; bool IsRow = true; MachineInstr *NewMI = nullptr; - ShapeT Shape = VRM.getShape(Phys2Virt[I]); for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) { // Here is the data format for the tile config. 
// 0 palette @@ -172,7 +224,15 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { "Cannot initialize with different shapes"); continue; } - Imm = DefMI.getOperand(1).getImm(); + if (DefMI.getOperand(1).isImm()) { + Imm = DefMI.getOperand(1).getImm(); + } else { + assert(DefMI.getOpcode() == X86::MOV32r0 && + "The opcode is assumed to be MOV32r0 if the operand is not " + "immediate."); + Imm = 0; + } + NewMI = addFrameReference( BuildMI(MF.front(), ++ConstMI->getIterator(), DL, TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)), diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 1f608f47ef79f..d48b5029dc740 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1875,6 +1875,10 @@ const StringMap sys::getHostCPUFeatures() { MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1); + bool HasLeaf1E = + MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX); + Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; + bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 09d4312918acf..e022734eb69ff 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -598,6 +598,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll new file mode 100644 index 0000000000000..4f41410010302 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s + +@buf = dso_local global [2048 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + +define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { +; CHECK-LABEL: test_tile_2rpntlvwz0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-NEXT: movw %si, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) 
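
Aside to the Host.cpp hunk above: a hedged standalone sketch of the CPUID probe (leaf 0x1E, sub-leaf 0, EAX bit 5). The real detection also requires the AMX XSAVE state to be OS-enabled (HasAMXSave), which this sketch omits; <cpuid.h> is GCC/Clang-specific on x86.

#include <cpuid.h>
#include <cstdio>

static bool hasAMXTransposeCpuidBit() {
  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
  // Returns 0 if leaf 0x1E exceeds the CPU's maximum supported leaf.
  if (!__get_cpuid_count(0x1e, 0, &EAX, &EBX, &ECX, &EDX))
    return false;
  return (EAX >> 5) & 1; // amx-transpose feature bit
}

int main() { std::printf("amx-transpose: %d\n", hasAMXTransposeCpuidBit()); }
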
+; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $cl +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $buf, %esi +; CHECK-NEXT: movl $32, %edi +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 +; CHECK-NEXT: movabsq $64, %rbx +; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: movabsq $64, %rbx +; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; CHECK-NEXT: movl $buf2, %edx +; CHECK-NEXT: movl $32, %esi +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: leaq -8(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 + %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 + %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 + %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 + %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 + %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 + %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 + %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 + %12 = tail call 
x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 + ret void +} + +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 + +declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 + +declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 + +attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } +attributes #1 = { argmemonly nofree nounwind readonly } +attributes #2 = { nofree nosync nounwind readnone } +attributes #3 = { nounwind } +attributes #4 = { argmemonly nounwind writeonly } + +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"uwtable", i32 2} +!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir new file mode 100644 index 0000000000000..dc79134321e9c --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir @@ -0,0 +1,165 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: [] +liveins: + - { reg: '$edi', virtual-reg: '' } + - { reg: '$esi', virtual-reg: '' } + - { reg: '$edx', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' 
} + - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $rdi, $rsi, $rdx, $rax + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $rdi, $rsi, $rdx, $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 + ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) + ; CHECK-NEXT: renamable $cx = MOV16ri 64 + ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) + ; CHECK-NEXT: renamable $cx = MOV16ri 16 + ; CHECK-NEXT: renamable $r8w = MOV16ri 16 + ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: 
(store (s512) into %stack.4 + 53, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) + ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) + ; CHECK-NEXT: renamable $r9 = COPY $rsi + ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) + ; CHECK-NEXT: renamable $r8 = COPY $rdi + ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) + ; CHECK-NEXT: renamable $r10 = COPY $rax + ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) + ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 + ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $zmm0 = AVX512_512_SET0 + VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) + MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + renamable $rcx = MOV32ri64 64 + MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) + renamable $cx = MOV16ri 64 + MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) + renamable $cx = MOV16ri 16 + renamable $r8w = MOV16ri 16 + MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) + PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) + renamable $r9 = COPY $rsi + $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) + renamable $r8 = COPY $rdi + $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) + renamable $r10 = COPY $rax + $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) + renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, 
renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg + renamable $tmm0 = COPY renamable $tmm5 + renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 + PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx + PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg + renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg + renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 +... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir new file mode 100644 index 0000000000000..e62a52162d523 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s + +--- | + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { + entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = extractvalue { x86_amx, x86_amx } %0, 1 + %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 + %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 + ret void + } + + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 + + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 + + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + + declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 + + attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { nounwind } + +... +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr32, preferred-register: '' } + - { id: 1, class: gr32, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr16, preferred-register: '' } + - { id: 5, class: gr16, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 12, class: tile, preferred-register: '' } + - { id: 13, class: gr64, preferred-register: '' } + - { id: 14, class: vr512, preferred-register: '' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx + + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 
2, basealign 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf + ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 + ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: RET 0 + %2:gr32 = COPY $edx + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %14:vr512 = AVX512_512_SET0 + VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) + MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + %6:gr64 = MOV32ri64 @buf + %7:gr64_nosp = MOV32ri64 32 + %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg + %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit + %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 + %13:gr64 = MOV32ri64 @buf2 + PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 + RET 0 + +... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir new file mode 100644 index 0000000000000..857ad433af153 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: [] +liveins: + - { reg: '$edi', virtual-reg: '' } + - { reg: '$esi', virtual-reg: '' } + - { reg: '$edx', virtual-reg: '' } + - { reg: '$cx', virtual-reg: '' } + - { reg: '$r9', virtual-reg: '' } + - { reg: '$r10', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 + + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg + ; CHECK-NEXT: $rax = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) + ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: $rax = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) + ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 + ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) + ; CHECK-NEXT: renamable $di = MOV16ri 64 + ; 
CHECK-NEXT: renamable $cx = MOV16ri 16 + ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 + ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) + renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg + renamable $tmm0 = COPY renamable $tmm5 + renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + renamable $r8 = MOV32ri64 64 + MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) + renamable $di = MOV16ri 64 + renamable $cx = MOV16ri 16 + PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 + PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + +... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll new file mode 100644 index 0000000000000..52641c65c90e9 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + + ; Function Attrs: noinline nounwind optnone uwtable + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { +; CHECK-LABEL: @test_tile_2rpntlvwz0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]]) +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = 
call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) +; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) +; CHECK-NEXT: ret void +; + entry: + + %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 + store <256 x i32> %2, ptr %m, align 1024 + + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 + store <256 x i32> %4, ptr %m, align 1024 + + %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 + %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 + store <256 x i32> %6, ptr %m, align 64 + + %7 = load <256 x i32>, ptr %m, align 64 + %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 + %9 = load <256 x i32>, ptr %m, align 64 + %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 + %11 = load <256 x i32>, ptr %m, align 64 + %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 + + %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 + %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 + store <256 x i32> %14, ptr %m, align 64 + + ret void + } + + ; Function Attrs: argmemonly nounwind readonly + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 + + ; Function Attrs: nounwind readnone + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 + + ; Function Attrs: nounwind readnone + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 + + ; Function Attrs: argmemonly nounwind writeonly + declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 + + attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #6 = { 
argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll new file mode 100644 index 0000000000000..346d46b6b16c2 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + + ; Function Attrs: nounwind uwtable + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { +; CHECK-LABEL: @test_tile_2rpntlvwz0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] +; CHECK-NEXT: ret void +; + entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 + %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 + %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 + %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 + %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 + %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 + %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 + %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 + %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 + ret void + } + + ; Function Attrs: argmemonly nounwind readonly + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 + + ; Function Attrs: nounwind readnone + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 + + ; Function Attrs: nounwind readnone + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + + ; Function Attrs: argmemonly 
nounwind writeonly + declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 + + attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir new file mode 100644 index 0000000000000..cdc525193fef7 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr64_nosp, preferred-register: '' } + - { id: 1, class: gr16, preferred-register: '' } + - { id: 2, class: gr16, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr64, preferred-register: '' } + - { id: 5, class: gr64, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 181, class: tile, preferred-register: '' } + - { id: 183, class: tile, preferred-register: '' } + - { id: 185, class: tile, preferred-register: '' } + - { id: 186, class: tile, preferred-register: '' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: 
'', debug-info-location: '' } + - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 21, name: '', type: default, offset: 0, size: 8, + alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $rdi, $rsi, $rdx, $rax + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $rdi, $rsi, $rdx, $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 + ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 + ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + %0:gr64_nosp = MOV32ri64 64 + %1:gr16 = MOV16ri 64 + %2:gr16 = MOV16ri 16 + %3:gr16 = MOV16ri 16 + %4:gr64 = COPY $rsi + %5:gr64 = COPY $rdi + %6:gr64 = COPY $rdx + 
%7:gr64_nosp = COPY $rax + %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg + %9:tile = COPY %8.sub_t1 + %10:tile = COPY %8.sub_t0 + PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 + PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 + %11:tile = PTILEZEROV %1, %2 + PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 + %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg + %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg + %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg + %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 + PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 +... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir new file mode 100644 index 0000000000000..a9824dcac6b04 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir @@ -0,0 +1,113 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr32, preferred-register: '' } + - { id: 1, class: gr32, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr16, preferred-register: '' } + - { id: 5, class: gr16, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 12, class: tile, preferred-register: '' } + - { id: 13, class: gr64, preferred-register: '' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx, $rax, $rbx + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + 
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx + ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: RET 0 + %2:gr32 = COPY $edx + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %3:gr16 = COPY %2.sub_16bit + %4:gr16 = COPY %1.sub_16bit + %5:gr16 = COPY %0.sub_16bit + %6:gr64 = COPY $rax + %7:gr64_nosp = MOV32ri64 32 + %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg + %9:tile = COPY %8.sub_t1 + %10:tile = COPY %8.sub_t0 + %11:tile = PTILEZEROV %5, %4 + %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 + %13:gr64 = COPY $rbx + PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 + RET 0 + +... 
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll new file mode 100644 index 0000000000000..b06a9369b9762 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-tile,+amx-bf16,+amx-int8,+amx-transpose | FileCheck %s + +define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 +; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 +; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 +; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 +; CHECK-NEXT: ttransposed %tmm3, %tmm1 +; CHECK-NEXT: retq + call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.ttransposed(i8 1, i8 3) + ret void +} + +declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) + +define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { +; CHECK-LABEL: test_amx3: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movw $8, %cx +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: ttransposed %tmm4, %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %5 = extractvalue { x86_amx, x86_amx } %4, 0 + %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) + ret void +} + +define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { +; CHECK-LABEL: test_amx_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: 
movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: movabsq $64, %rcx +; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) +; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 + %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 + %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 + %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 + %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 + %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 + %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 + %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 + %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 + %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) + call 
void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) + ret void +} + +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) +declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index d1b8be15a2d03..9b123b730a214 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -3,7 +3,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp 
$fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -11,7 +11,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b 
$r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.s b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s new file mode 100644 index 0000000000000..da3fa95ef6dd0 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: 
[0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5 + +// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5 + +// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2 +// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3 + +// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + +// CHECK: ttransposed %tmm1, %tmm5 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] + ttransposed %tmm1, %tmm5 + +// CHECK: ttransposed %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] + ttransposed %tmm2, %tmm3 diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt new file mode 100644 index 0000000000000..e4f1689639ef9 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -0,0 +1,58 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456] 
+0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: ttransposed %tmm1, %tmm2 +# INTEL: ttransposed tmm2, tmm1 +0xc4,0xe2,0x7a,0x5f,0xd1 + +# ATT: ttransposed %tmm2, %tmm3 +# INTEL: ttransposed tmm3, tmm2 +0xc4,0xe2,0x7a,0x5f,0xda diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s new file mode 100644 index 0000000000000..3b8dfaed313d6 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1t1 tmm2, [2*rbp - 32] + +// CHECK: ttransposed tmm5, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] + ttransposed tmm5, tmm1 + +// CHECK: ttransposed tmm3, tmm2 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] + ttransposed tmm3, tmm2 diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 8f22d097681b1..87f96ed28e326 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // Create a stack location and ensure it's tracked. 
SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)}; SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL); - ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs. + ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs. // Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs, // then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}). // Finally, one for spilt fp80 registers. @@ -1135,7 +1135,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // There are other locations, for things like xmm0, which we're going to // ignore here. - auto [MInLocs, MOutLocs] = allocValueTables(4, 11); + auto [MInLocs, MOutLocs] = allocValueTables(4, 13); // Transfer function: start with nothing. SmallVector TransferFunc; @@ -1170,7 +1170,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // function. TransferFunc[1].insert({ALStackLoc, ALDefInBlk1}); TransferFunc[1].insert({HAXStackLoc, HAXDefInBlk1}); - initValueArray(MInLocs, 4, 11); + initValueArray(MInLocs, 4, 13); placeMLocPHIs(*MF, AllBlocks, MInLocs, TransferFunc); EXPECT_EQ(MInLocs[3][ALStackLoc.asU64()], ALPHI); EXPECT_EQ(MInLocs[3][AXStackLoc.asU64()], AXPHI); diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 26b881651ea41..c6cd3da13646a 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1162,6 +1162,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("vz512mem", TYPE_MVSIBZ) TYPE("BNDR", TYPE_BNDR) TYPE("TILE", TYPE_TMM) + TYPE("TILEPair", TYPE_TMM_PAIR) errs() << "Unhandled type string " << s << "\n"; llvm_unreachable("Unhandled type string"); } @@ -1243,6 +1244,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s, ENCODING("VK64", ENCODING_RM) ENCODING("BNDR", ENCODING_RM) ENCODING("TILE", ENCODING_RM) + ENCODING("TILEPair", ENCODING_RM) errs() << "Unhandled R/M register encoding " << s << "\n"; llvm_unreachable("Unhandled R/M register encoding"); } @@ -1292,6 +1294,7 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s, ENCODING("VK64WM", ENCODING_REG) ENCODING("BNDR", ENCODING_REG) ENCODING("TILE", ENCODING_REG) + ENCODING("TILEPair", ENCODING_REG) errs() << "Unhandled reg/opcode register encoding " << s << "\n"; llvm_unreachable("Unhandled reg/opcode register encoding"); } @@ -1322,6 +1325,7 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s, ENCODING("VK32", ENCODING_VVVV) ENCODING("VK64", ENCODING_VVVV) ENCODING("TILE", ENCODING_VVVV) + ENCODING("TILEPair", ENCODING_VVVV) errs() << "Unhandled VEX.vvvv register encoding " << s << "\n"; llvm_unreachable("Unhandled VEX.vvvv register encoding"); } From f822950342402c03b0597b173f38f007c6fa3931 Mon Sep 17 00:00:00 2001 From: "Wang, Phoebe" Date: Tue, 29 Oct 2024 22:04:30 +0800 Subject: [PATCH 2/3] Address review comment --- llvm/include/llvm/CodeGen/TileShapeInfo.h | 2 +- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 2 +- llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 2 +- llvm/lib/TargetParser/Host.cpp | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h index 0e0a883b0c595..269d8e2964f4f 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -36,7 +36,7 @@ class ShapeT { } // When ShapeT has mult shapes, we only use Shapes (never use Row and Col) // 
and ImmShapes. Due to the most case is only one shape (just simply use
-  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor
+  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
   // Shapes to keep the speed and code simplicity.
   // TODO: The upper solution is a temporary way to minimize current tile
   // register allocation code changes. It can not handle both Reg shape and
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 5584c08a98303..f832955d1202f 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -615,7 +615,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MBB.erase(MBBI);
     return true;
   }
-  // Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
+  // Similar to TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
   // corresponding AMX instruction to support it. So, split it too:
   // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
   // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index fd0987db5e4f5..62d0f6ca79434 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -268,7 +268,7 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
                     << printReg(TileReg, TRI) << '\n');
 }
 
-unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
+static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
   if (Reg.isVirtual()) {
     unsigned RegClassID = MRI->getRegClass(Reg)->getID();
     if (RegClassID == X86::TILERegClassID)
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index d48b5029dc740..a9befee9577ec 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1875,8 +1875,8 @@ const StringMap<bool> sys::getHostCPUFeatures() {
       MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
   Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1);
 
-  bool HasLeaf1E =
-      MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX);
+  bool HasLeaf1E = MaxLevel >= 0x1e &&
+                   !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
   Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
 
   bool HasLeaf24 =

From 5e762272c39984495a7bc9288f845811f10abd93 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe"
Date: Fri, 1 Nov 2024 15:10:08 +0800
Subject: [PATCH 3/3] Address review comments

---
 clang/lib/CodeGen/CGBuiltin.cpp           |  2 +-
 clang/lib/Headers/amxtransposeintrin.h    |  6 +++---
 llvm/include/llvm/CodeGen/TileShapeInfo.h |  6 +++---
 llvm/lib/Target/X86/X86LowerAMXType.cpp   | 10 +++++-----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 879cf9fa8913e..34fedd6711475 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17040,7 +17040,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
 
     // Note: Here we escape directly use x86_tilestored64_internal to store
-    // the results due to it can't make sure the Mem writen scope. This may
+    // the results due to it can't make sure the Mem written scope. This may
    // cause shapes reloads after first amx intrinsic, which current amx reg-
    // ister allocation has no ability to handle it.
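One practical note on the Host.cpp hunk above: AMX-TRANSPOSE is enumerated in CPUID.(EAX=1EH, ECX=1H):EAX[5], which is why leaf 0x1E now has to be queried with an explicit subleaf of 1. For readers who want to reproduce the probe outside of LLVM, here is a minimal sketch assuming GCC/Clang's <cpuid.h>; the AMX-state check that the code above folds in via HasAMXSave is deliberately elided:

#include <cpuid.h>
#include <stdbool.h>

/* Sketch of the enumeration bit only. A production probe must also
 * confirm that the OS has enabled AMX tile state (XCR0 bits 17:18),
 * which Host.cpp checks separately via HasAMXSave. */
static bool has_amx_transpose(void) {
  unsigned eax, ebx, ecx, edx;
  /* CPUID.(EAX=1EH, ECX=1H):EAX[5] reports AMX-TRANSPOSE. */
  if (!__get_cpuid_count(0x1e, 0x1, &eax, &ebx, &ecx, &edx))
    return false;
  return (eax >> 5) & 1;
}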
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
index d5dc68f415284..b3fa37d766c45 100644
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -32,7 +32,7 @@
 /// \headerfile <immintrin.h>
 ///
 /// \code
-/// void __tile_transposed(__tile dst, __tile src);
+/// void _tile_transposed(__tile dst, __tile src);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
@@ -40,7 +40,7 @@
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
 /// \param src
-///    The 1st source tile. Max size is 1024 Bytes.
+///    The source tile. Max size is 1024 Bytes.
 ///
 /// \code{.operation}
 ///
@@ -238,7 +238,7 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
 /// \param src
-///    The 1st source tile. Max size is 1024 Bytes.
+///    The source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_TRANSPOSE
 static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
   dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
 }
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 269d8e2964f4f..24f303a7d9d13 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,14 +34,14 @@ class ShapeT {
     if (MRI)
       deduceImm(MRI);
   }
-  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+  // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col)
   // and ImmShapes. Due to the most case is only one shape (just simply use
   // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
   // Shapes to keep the speed and code simplicity.
   // TODO: The upper solution is a temporary way to minimize current tile
   // register allocation code changes. It can not handle both Reg shape and
   // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
-  // is imm shape). Refine me when we have more mult-tile shape instructions!
+  // is imm shape). Refine me when we have more multi-tile shape instructions!
   ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
          const MachineRegisterInfo *MRI = nullptr)
       : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
@@ -57,7 +57,7 @@ class ShapeT {
   ShapeT()
       : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
         ColImm(InvalidImmShape) {}
-  // TODO: We need to extern cmp operator for muti-shapes if
+  // TODO: We need to extern cmp operator for multi-shapes if
   // we have requirement in the future.
   bool operator==(const ShapeT &Shape) const {
     MachineOperand *R = Shape.Row;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index b9d811e91ab5a..688e886cf3b13 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -551,7 +551,7 @@ static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
   assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
   auto *II = dyn_cast<IntrinsicInst>(TileDef);
   unsigned Idx = 0;
-  // Extract tile from mult tiles' def.
+  // Extract tile from multiple tiles' def.
   if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
     assert(Extr->hasIndices() && "Tile extract miss index!");
     Idx = Extr->getIndices()[0];
@@ -584,7 +584,7 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
     Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
     II = cast<IntrinsicInst>(PhiOp);
   } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
-    // Extract tile from mult tiles' def.
+    // Extract tile from multiple tiles' def.
     assert(Extr->hasIndices() && "Tile extract miss index!");
     Idx = Extr->getIndices()[0];
     II = cast<IntrinsicInst>(Extr->getOperand(0));
@@ -1040,7 +1040,7 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
 
   assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
 
-  // TODO: Specially handle the mult-use case.
+  // TODO: Specially handle the multi-use case.
   if (Tile->getNumUses() != 1)
     return false;
 
@@ -1057,8 +1057,8 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
     Row = II->getOperand(0);
     Col = II->getOperand(1);
   } else {
-    // Now we supported mult-tiles value in structure, so we may get tile
-    // from extracting mult-tiles structure.
+    // Now we support multi-tile values in structure, so we may get a tile
+    // from extracting a multi-tile structure.
     // For example:
     // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
     // i16 %2, i16 %3, i8* %4, i64 %5)
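To make the intended end-to-end use of the new header concrete, here is a hedged usage sketch of the C API added in amxtransposeintrin.h. The __tile_2rpntlvwz0 parameter list is inferred from the __tile_2rpntlvwz1t1 fragment and the builtin signatures above, the 16x64 tile shapes are purely illustrative, and tile-palette configuration plus OS enablement of AMX are assumed to have been done by the caller (__tile_stored is the existing amx-tile helper from amxintrin.h):

#include <immintrin.h>
#include <stddef.h>

/* Build with -mamx-tile -mamx-transpose. Assumes _tile_loadconfig() has
 * already programmed a palette with 16-row x 64-byte tiles. */
void load_pair_and_transpose(const void *src, void *dst, size_t stride) {
  __tile1024i lo = {16, 64};  /* first tile of the pair (rows, colsb) */
  __tile1024i hi = {16, 64};  /* second tile of the pair */
  __tile1024i out = {16, 64}; /* destination of the transpose */
  /* T2RPNTLVWZ0: load a pair of tiles from src in one instruction. */
  __tile_2rpntlvwz0(&lo, &hi, src, stride);
  /* TTRANSPOSED: write the transpose of the first tile into out. */
  __tile_transposed(&out, lo);
  /* Store the transposed tile back to memory. */
  __tile_stored(dst, stride, out);
}

The { x86_amx, x86_amx } struct return in the IR comment above is what the paired load compiles down to, which is why combineCastStore now has to look through extractvalue to recover the tile shape.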