[LLVM][NVPTX] Add codegen support for tcgen05.{ld, st} instructions #126740

Merged
95 changes: 95 additions & 0 deletions llvm/docs/NVPTXUsage.rst
@@ -1271,6 +1271,101 @@ For more information on the decompression schemes, refer to the PTX ISA
For more information on the tcgen05.cp instruction, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-cp>`_.

'``llvm.nvvm.tcgen05.ld.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare <n x i32> @llvm.nvvm.tcgen05.ld.<shape>.<num>(ptr addrspace(6) %tmem_addr, i1 %pack)

declare <n x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, i64 %offset, i1 %pack)

Overview:
"""""""""

This group of intrinsics asynchronously loads data from the Tensor Memory at the location specified
by the 32-bit address operand `tmem_addr` into the destination registers, collectively across all threads
of the warp.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
of the collective load operation. Otherwise, the behavior is undefined.

The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n') that
is loaded from the Tensor Memory. The `shape` qualifier indicates the base dimension of the data, and the `num`
qualifier indicates the repeat factor on the base dimension, resulting in the total dimension of the data that is accessed.

Allowed values for the 'num' are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for the 'shape' in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

Allowed value for the 'shape' in the second intrinsic is `16x32bx2`.

The result of the intrinsic is a vector consisting of one or more 32-bit registers derived from `shape` and
`num` as shown below.

=========== ========================= ========== ==========
num/shape 16x32bx2/16x64b/32x32b 16x128b 16x256b
=========== ========================= ========== ==========
x1 1 2 4
x2 2 4 8
x4 4 8 16
x8 8 16 32
x16 16 32 64
x32 32 64 128
x64 64 128 NA
x128 128 NA NA
=========== ========================= ========== ==========

The last argument `i1 %pack` is a compile-time constant which, when set, indicates that adjacent columns are packed into a single 32-bit element during the load.
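
For example, a minimal sketch of a load using the `16x64b` shape with the `x4` num qualifier, which
yields four 32-bit elements per the table above (the `%taddr` operand is an assumed Tensor Memory address):

.. code-block:: llvm

  ; load four 32-bit elements from Tensor Memory at %taddr;
  ; the trailing i1 0 disables packing
  %val = call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) %taddr, i1 0)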

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.


'``llvm.nvvm.tcgen05.st.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare void @llvm.nvvm.tcgen05.st.<shape>.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i1 %unpack)

declare void @llvm.nvvm.tcgen05.st.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i64 %offset, i1 %unpack)

Overview:
"""""""""

This group of intrinsics asynchronously stores data from the source vector into the Tensor Memory at the location
specified by the 32-bit address operand `tmem_addr`, collectively across all threads of the warp.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address of the
collective store operation. Otherwise, the behavior is undefined.

The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n') that
is stored to the Tensor Memory. The `shape` qualifier indicates the base dimension of the data, and the `num`
qualifier indicates the repeat factor on the base dimension, resulting in the total dimension of the data that is accessed.

Allowed values for the 'num' are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for the 'shape' in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

Allowed value for the 'shape' in the second intrinsic is `16x32bx2`.

The `args` argument is a vector consisting of one or more 32-bit registers derived from `shape` and
`num`, as listed in the table in the `tcgen05.ld` section.

Each shape supports an `unpack` mode, which allows a 32-bit element in a register to be unpacked into two 16-bit elements and stored in adjacent columns. The last argument `i1 %unpack` is a compile-time constant which, when set to 1, enables the unpack mode; when set to 0, unpacking is disabled.
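
For example, a minimal sketch of a store using the `32x32b` shape with the `x2` num qualifier (the
`%taddr` and `%vals` operands are assumed values):

.. code-block:: llvm

  ; store two 32-bit elements from %vals into Tensor Memory at %taddr;
  ; the trailing i1 0 disables unpacking
  call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) %taddr, <2 x i32> %vals, i1 0)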

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.

Other Intrinsics
----------------

1 change: 1 addition & 0 deletions llvm/include/llvm/IR/Intrinsics.td
@@ -564,6 +564,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v128i32_ty : LLVMType<v128i32>; // 128 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32

def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
65 changes: 65 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -664,6 +664,35 @@ class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
ImmArg<ArgIndex<FlagsStartIdx>>];
}

class NVVM_TCGEN05_LDST_NAME<string Op, string Shape, int Num> {
string intr = "llvm.nvvm.tcgen05." # Op
# "." # Shape
# "." # "x" # !shl(1, Num);

string record = !subst(".", "_",
!subst("llvm.", "int_", intr));
}
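// For example (an illustrative expansion, not an exhaustive list):
// NVVM_TCGEN05_LDST_NAME<"ld", "16x64b", 2>.intr yields
// "llvm.nvvm.tcgen05.ld.16x64b.x4" (the x-qualifier is 1 << Num), and
// .record yields "int_nvvm_tcgen05_ld_16x64b_x4".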


class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
int shift = !cond(!eq(Shape, "16x128b"): 1,
!eq(Shape, "16x256b"): 2,
true : 0);

int veclen = !shl(1, !add(Num, shift));

int valid = !le(veclen, 128);
LLVMType type = !cond(!eq(veclen, 1): llvm_i32_ty,
!eq(veclen, 2): llvm_v2i32_ty,
!eq(veclen, 4): llvm_v4i32_ty,
!eq(veclen, 8): llvm_v8i32_ty,
!eq(veclen, 16): llvm_v16i32_ty,
!eq(veclen, 32): llvm_v32i32_ty,
!eq(veclen, 64): llvm_v64i32_ty,
!eq(veclen, 128): llvm_v128i32_ty,
true : llvm_void_ty);
}
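// For example, Shape = "16x256b" with Num = 3 gives shift = 2, so
// veclen = 1 << (3 + 2) = 32 and the access type is llvm_v32i32_ty;
// combinations with veclen > 128 are rejected via the `valid` bit.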

let TargetPrefix = "nvvm" in {
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -5186,4 +5215,40 @@ foreach cta_group = ["cg1", "cg2"] in {
}
}

// Tcgen05 ld intrinsics
class NVVM_TCGEN05_LD<string Shape, int Num> :
Intrinsic<[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
!listconcat([llvm_tmem_ptr_ty],
!if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
[llvm_i1_ty]),
!listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
!if(!eq(Shape, "16x32bx2"),
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
[ImmArg<ArgIndex<1>>])),
NVVM_TCGEN05_LDST_NAME<"ld", Shape, Num>.intr>;

// Tcgen05 st intrinsics
class NVVM_TCGEN05_ST<string Shape, int Num> :
Intrinsic<[],
!listconcat([llvm_tmem_ptr_ty],
!if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
[llvm_i1_ty]),
!listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
!if(!eq(Shape, "16x32bx2"),
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>],
[ImmArg<ArgIndex<2>>])),
NVVM_TCGEN05_LDST_NAME<"st", Shape, Num>.intr>;

foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
foreach num = !range(0, 8) in {
if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
def NVVM_TCGEN05_LDST_NAME<"ld", shape, num>.record:
NVVM_TCGEN05_LD<shape, num>;
def NVVM_TCGEN05_LDST_NAME<"st", shape, num>.record:
NVVM_TCGEN05_ST<shape, num>;
}
}
}
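// The generated combinations follow the shape/num table in
// NVPTXUsage.rst; e.g. 16x256b stops at x32 (v128i32), since x64 and
// x128 would exceed the 128-register limit and fail the `valid` check.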

} // let TargetPrefix = "nvvm"