From c37453c8477448cf10a349ff0c4bb5e1fea8fb27 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Mon, 24 Feb 2025 19:37:31 +0530 Subject: [PATCH] [NVPTX] Add intrinsics for st.bulk instruction Adds NVVM intrinsics and NVPTX codegen for the `st.bulk` instruction introduced in ptx8.6 for sm_100. Tests added in `CodeGen/NVPTX/st_bulk.ll` and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk --- llvm/docs/NVPTXUsage.rst | 32 +++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 14 ++++++++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 14 ++++++++ llvm/test/CodeGen/NVPTX/st_bulk.ll | 46 ++++++++++++++++++++++++ 4 files changed, 106 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/st_bulk.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 0403bbc1eabb7..6339146b5cc7a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1407,6 +1407,38 @@ The last argument `i1 %unpack` is a compile-time constant which when set, indica For more information, refer to the `PTX ISA `__. +Store Intrinsics +---------------- + +'``llvm.nvvm.st.bulk.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.st.bulk(ptr addrspace(1) %dst, i64 %size, i64 immarg %initval) + declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dst, i64 %size, i64 immarg %initval) + +Overview: +""""""""" + +The '``@llvm.nvvm.st.bulk.*``' intrinsics initialize a region of shared memory +starting from the location specified by the destination address operand `%dst`. + +The integer operand `%size` specifies the amount of memory to be initialized in +terms of number of bytes and must be a multiple of 8. Otherwise, the behavior +is undefined. + +The integer immediate operand `%initval` specifies the initialization value for +the memory locations. The only numeric value allowed is 0. + +The ``@llvm.nvvm.st.bulk.shared.cta`` and ``@llvm.nvvm.st.bulk`` intrinsics are +similar but the latter uses generic addressing (see `Generic Addressing `__). + +For more information, refer `PTX ISA `__. + Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 72b05bca08940..19b7a9b3f711c 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -5261,4 +5261,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in { } } +// +// Bulk store intrinsics +// + +def int_nvvm_st_bulk: DefaultAttrsIntrinsic<[], + [llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty], + [IntrArgMemOnly, IntrWriteMem, + WriteOnly>, NoCapture>, ImmArg>]>; + +def int_nvvm_st_bulk_shared_cta : DefaultAttrsIntrinsic<[], + [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty], + [IntrArgMemOnly, IntrWriteMem, + WriteOnly>, NoCapture>, ImmArg>]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 7d7e69adafcd0..0a5e711a9726a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7766,3 +7766,17 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in { } } // isConvergent + +// Bulk store instructions + +def INT_NVVM_ST_BULK_GENERIC : + NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), + "st.bulk [$dest_addr], $size, 0;", + [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, (i64 0))]>, + Requires<[hasSM<100>, hasPTX<86>]>; + +def INT_NVVM_ST_BULK_SHARED_CTA: + NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), + "st.bulk.shared::cta [$dest_addr], $size, 0;", + [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>, + Requires<[hasSM<100>, hasPTX<86>]>; diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll new file mode 100644 index 0000000000000..085df7f1d8d3f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} + +declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64) +define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) { +; CHECK-LABEL: st_bulk( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1]; +; CHECK-NEXT: st.bulk [%rd1], %rd2, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0) + ret void +} + +declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64) +define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) { +; CHECK-PTX64-LABEL: st_bulk_shared_cta( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0]; +; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1]; +; CHECK-PTX64-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: st_bulk_shared_cta( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [st_bulk_shared_cta_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_1]; +; CHECK-PTX-SHARED32-NEXT: st.bulk.shared::cta [%r1], %rd1, 0; +; CHECK-PTX-SHARED32-NEXT: ret; + call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0) + ret void +}