diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 172ef94b3bd1a..34c7097883271 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -86,6 +86,8 @@ def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group" def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">; def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">; def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">; +def AspectExt_codeplay_cuda_tensor_map : Aspect<"ext_codeplay_cuda_tensor_map">; + // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -150,7 +152,9 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_atomic16, - AspectExt_oneapi_virtual_functions], + AspectExt_oneapi_virtual_functions, + AspectExt_codeplay_cuda_tensor_map, + ], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. @@ -265,9 +269,21 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>; def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; + [ + AspectFp16, + AspectAtomic64, + AspectExt_oneapi_cuda_async_barrier, + AspectExt_oneapi_cuda_cluster_group, + AspectExt_codeplay_cuda_tensor_map, + ])>; def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; + [ + AspectFp16, + AspectAtomic64, + AspectExt_oneapi_cuda_async_barrier, + AspectExt_oneapi_cuda_cluster_group, + AspectExt_codeplay_cuda_tensor_map, + ])>; // // HIP / AMDGPU device aspects diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc new file mode 100644 index 0000000000000..3f89b734915e3 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc @@ -0,0 +1,367 @@ += sycl_ext_codeplay_cuda_tensor_map + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:cuda-guide-using-tma: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-tma-to-transfer-multi-dimensional-arrays +:cuda-guide-async-copies: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#asynchronous-data-copies-using-tensor-memory-access-tma + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + +== Notice + +[%hardbreaks] +Copyright (C) Codeplay Software Limited. All rights reserved. 
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 9 specification. All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+== Status
+
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback. Interfaces defined in
+this specification are implemented in {dpcpp}, but they are not finalized
+and may change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in
+this specification.*
+
+[NOTE]
+====
+This extension is currently implemented in {dpcpp} only for NVIDIA GPU devices
+with Compute Capability 9.0 or above, and only when using the CUDA backend.
+====
+
+== Introduction
+
+This document describes an extension that gives oneAPI implementers access to
+CUDA's Tensor Memory Access (TMA) APIs from within SYCL kernels. These
+interfaces provide utilities that enable accelerated copies of
+multidimensional arrays of various types. There is no novelty here; only the
+plumbing needed to access the accelerated features.
+
+== Specification
+
+=== Feature Test Macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an
+implementation supporting this extension must predefine the macro
+`SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|Initial version of this extension
+|===
+
+=== Overview
+
+Certain libraries shipped with oneAPI need access to proprietary accelerator
+extensions to achieve good performance, using driver-provided features that
+are beyond the reach of the compiler or the language model. One such extension
+is CUDA's link:{cuda-guide-async-copies}[Tensor Memory Access] interface, which
+enables accelerated copies of multidimensional arrays. This is *not* a feature
+that can be implemented in the compiler directly, because the parameters for
+the tensor copy adhere to an unusual ABI and the context type used to
+initialize such operations requires a call to the driver from the host. The
+initialized data can then *only* be acted upon within the device context.
+Thus, gaining access to accelerated "tensor" copies within SYCL requires
+interfaces that expose this pattern in such a way that the user's data can be
+accessed from within kernels.
+
+[NOTE]
+====
+These interfaces are for initializing asynchronous *multidimensional* array
+copies only. One-dimensional array copies should preferably be performed using
+the standard SYCL memcpy facilities.
+====
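+
+As a non-normative sketch (the exact usage pattern is up to the application),
+code can combine the feature-test macro above with the
+`ext_codeplay_cuda_tensor_map` device aspect added by this extension to detect
+support at compile time and at run time:
+
+[source, c++]
+----
+#include <sycl/sycl.hpp>
+
+bool can_use_tensor_maps(const sycl::device &dev) {
+#ifdef SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP
+  // Runtime check: the aspect is only reported for sm_90/sm_90a CUDA devices.
+  return dev.has(sycl::aspect::ext_codeplay_cuda_tensor_map);
+#else
+  return false;
+#endif
+}
+----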
+
+==== Interface
+
+`sycl_ext_codeplay_cuda_tensor_map` defines two classes and a number of
+enumerations used to initialize the CUDA TMA context objects.
+
+Classes:
+
+- `tiled_encode_map`
+- `im2col_encode_map`
+
+Enumerations:
+
+- `datatype`
+- `interleave`
+- `swizzle`
+- `l2_promote`
+- `oob_fill`
+
+They are analogous to their CUDA namesakes. The size, alignment and layout of
+the structs are unspecified.
+
+Each class has a single constructor whose parameters control the tensor copy
+operation implied by the class name.
+
+The `tiled_encode_map` class is used to initialize the CUDA state used for
+tiled copies of multidimensional arrays. Its arguments are analogous to those
+of its CUDA namesake (cf. the link:{cuda-guide-using-tma}[CUDA programming
+guide]):
+
+  tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
+                   const uint64_t global_dims[/*rank*/],
+                   const uint64_t global_strides[/*rank - 1*/],
+                   const uint32_t box_dims[/*rank*/],
+                   const uint32_t element_strides[/*rank*/],
+                   interleave interleave, swizzle swizzle, l2_promote promote,
+                   oob_fill oob_fill);
+
+The `im2col_encode_map` class is used to initialize the CUDA state used for
+asynchronous copies with a re-encoding of blocks to columns. Its arguments are
+analogous to those of its CUDA namesake (cf. the
+link:{cuda-guide-using-tma}[CUDA programming guide]).
+
+When passed to a kernel by value, either of these class objects can have its
+address taken and passed as the second operand of the `cp.async.bulk.tensor`
+family of PTX instructions via inline assembly. No other operation is
+supported.
+
+These objects can be constructed in host code only. Attempting to construct
+them in a kernel is undefined. Using the address of a copied object for the
+CUDA tensor operations is also undefined.
+
+==== Sample Header for host code only
+
+[source, c++]
+----
+namespace sycl::ext::codeplay::experimental::cuda {
+  enum datatype : int {
+    type_uint8,
+    type_uint16,
+    type_uint32,
+    type_int32,
+    type_uint64,
+    type_int64,
+    type_float16,
+    type_float32,
+    type_float64,
+    type_bfloat16,
+    type_float32_ftz,
+    type_tfloat32,
+    type_tfloat32_ftz,
+  };
+  enum interleave : int {
+    interleave_none,
+    interleave_16,
+    interleave_32,
+  };
+  enum swizzle : int {
+    swizzle_none,
+    swizzle_32,
+    swizzle_64,
+    swizzle_128,
+  };
+  enum l2_promote : int {
+    promote_none,
+    promote_l2_64,
+    promote_l2_128,
+    promote_l2_256,
+  };
+  enum oob_fill : int {
+    oob_fill_none,
+    oob_fill_nan_request_zero_fma,
+  };
+
+struct tiled_encode_map {
+  tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
+                   const uint64_t global_dims[/*rank*/],
+                   const uint64_t global_strides[/*rank - 1*/],
+                   const uint32_t box_dims[/*rank*/],
+                   const uint32_t element_strides[/*rank*/],
+                   interleave interleave, swizzle swizzle, l2_promote promote,
+                   oob_fill oob_fill);
+private:
+  // Implementation-defined members must be private
+};
+
+struct im2col_encode_map {
+  im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr,
+                    const uint64_t gmem_dims[/*rank*/],
+                    const uint64_t gmem_strides[/*rank - 1*/],
+                    const int32_t pixel_box_lower_corner[/*rank*/],
+                    const int32_t pixel_box_upper_corner[/*rank*/],
+                    uint32_t channels_per_pixel, uint32_t pixels_per_col,
+                    const uint32_t element_strides[/*rank*/],
+                    interleave interleave, swizzle swizzle, l2_promote promote,
+                    oob_fill oob_fill);
+private:
+  // Implementation-defined members must be private
+};
+}
+----
+
+==== Sample Header for device code only
+
+[source, c++]
+----
+namespace sycl::ext::codeplay::experimental::cuda {
+class tiled_encode_map {
+public:
+  // Get access to the TMA descriptor for use as an operand to the
+  // cp.async.bulk.tensor family of PTX instructions
+  uintptr_t get_native_descriptor();
+};
+class im2col_encode_map {
+public:
+  // Get access to the TMA descriptor for use as an operand to the
+  // cp.async.bulk.tensor family of PTX instructions
+  uintptr_t get_native_descriptor();
+};
+}
+----
+
+== Examples
+
+[source, c++]
+----
+#include <cassert>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+#include <sycl/ext/codeplay/experimental/cuda_tensor_map.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::codeplay::experimental::cuda;
+namespace sycl_ext = sycl::ext::oneapi::experimental;
+
+#define RANK 2
+#define WIDTH (256)
+#define HEIGHT (8)
+
+int main() {
+  device cuda_dev{
+    [](const sycl::device &dev) {
+      return dev.get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1;
+    }
+  };
+  bool has_aspect = cuda_dev.has(aspect::ext_codeplay_cuda_tensor_map);
+  assert(has_aspect);
+  queue q{cuda_dev};
+  auto *mem = sycl::malloc_device<int32_t>(WIDTH * HEIGHT, q);
+
+  uint64_t global_dims[RANK] = {WIDTH, HEIGHT};
+  uint64_t global_strides[RANK - 1] = {WIDTH};
+  uint32_t box_dims[RANK] = {WIDTH / 2, HEIGHT / 2};
+  uint32_t element_strides[RANK] = {1, 1};
+
+  tiled_encode_map tile(
+      q,
+      static_cast<void *>(mem),
+      tiled_encode_map::datatype::type_int32,
+      RANK,
+      global_dims,
+      global_strides,
+      box_dims,
+      element_strides,
+      tiled_encode_map::interleave::interleave_none,
+      tiled_encode_map::swizzle::swizzle_none,
+      tiled_encode_map::l2_promote::promote_none,
+      tiled_encode_map::oob_fill::oob_fill_none);
+
+  q.submit([&](handler &cgh) {
+    sycl_ext::work_group_scratch_size static_size{WIDTH * HEIGHT *
+                                                  sizeof(int32_t)};
+    sycl_ext::properties properties{static_size};
+    cgh.parallel_for(nd_range<1>(range<1>(WIDTH), range<1>(WIDTH)), properties,
+                     [=](nd_item<1> item) {
+      sycl_ext::work_group_static<uint64_t> barrier_mem;
+      auto smem_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
+          sycl_ext::static_address_cast<
+              sycl::access::address_space::local_space>(
+              sycl_ext::get_work_group_scratch_memory())
+              .get_decorated()));
+      auto bar_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
+          sycl_ext::static_address_cast<
+              sycl::access::address_space::local_space>(&barrier_mem)
+              .get_decorated()));
+      (void)tile;
+      (void)smem_ptr;
+      (void)bar_ptr;
+#ifdef __SYCL_DEVICE_ONLY__
+      if (item.get_local_linear_id() == 0) {
+        // Only one work-item issues the bulk tensor copy.
+        int32_t tc0 = 0;
+        int32_t tc1 = 0;
+        asm volatile(
+            "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
+            " [%[smem_int_ptr]], [%[tma_descriptor], {%[tc0], %[tc1]}], [%[bar_ptr]];"
+            :
+            : [smem_int_ptr] "r"(smem_ptr),
+              [tma_descriptor] "l"(tile.get_native_descriptor()),
+              [bar_ptr] "r"(bar_ptr),
+              [tc0] "r"(tc0),
+              [tc1] "r"(tc1)
+            : "memory");
+      }
+#endif
+    });
+    // Do stuff with shared memory now...
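+    // n.b. this example only sketches how the descriptor is consumed; a
+    // complete kernel would also have to initialise the mbarrier referenced
+    // by `bar_ptr` before issuing the copy, and wait on that barrier before
+    // reading the tiles the TMA engine has written into the work-group
+    // scratch memory.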
+ }).wait(); +} +---- diff --git a/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp new file mode 100644 index 0000000000000..89b477e336c53 --- /dev/null +++ b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp @@ -0,0 +1,117 @@ +//==----------------- tensor_map.hpp --- CUDA TMA interop wrappers ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +#define SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP 1 + +namespace sycl { +inline namespace _V1 { +class queue; +namespace ext::codeplay::experimental::cuda { +namespace detail { +/// An opaque type passed to the runtime used to describe the properties of an +/// image. + +struct alignas(64) __tensor_copy_descriptor { +protected: + unsigned char data[128]; + +public: + // It'd be nice to shorten these enumeration names a little, but since many of + // them start with numbers, that'd be an illegal name, and nobody is going to + // prefer typing `tensor_copy_descriptor::interleave::sixteen` over + // `tensor_copy_descriptor::interleave_16`. Additionally, naming the type + // enumerations after the type they represent is sketchy since there are so + // many variations of uint32 et al in the wild. Thus: in the name of + // consistency all enumerators here duplicatively encode the type in their + // names + enum datatype : int { + type_uint8, + type_uint16, + type_uint32, + type_int32, + type_uint64, + type_int64, + type_float16, + type_float32, + type_float64, + type_bfloat16, + type_float32_ftz, + type_tfloat32, + type_tfloat32_ftz, + }; + enum interleave : int { + interleave_none, + interleave_16, + interleave_32, + }; + enum swizzle : int { + swizzle_none, + swizzle_32, + swizzle_64, + swizzle_128, + }; + enum l2_promote : int { + promote_none, + promote_l2_64, + promote_l2_128, + promote_l2_256, + }; + enum oob_fill : int { + oob_fill_none, + oob_fill_nan_request_zero_fma, + }; +}; +} // namespace detail + +struct __SYCL_EXPORT tiled_encode_map final + : public detail::__tensor_copy_descriptor { + tiled_encode_map() = delete; + // Can't be constructed on device, only passed into kernels from the host + tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank, + const uint64_t global_dims[/*rank*/], + const uint64_t global_strides[/*rank - 1*/], + const uint32_t box_dims[/*rank*/], + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); +#ifdef __SYCL_DEVICE_ONLY__ + uintptr_t get_native_descriptor() const { + return reinterpret_cast(this); + } +#endif +}; + +struct __SYCL_EXPORT im2col_encode_map final + : public detail::__tensor_copy_descriptor { + im2col_encode_map() = delete; + // Can't be constructed on device, only passed into kernels from the host + im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr, + const uint64_t gmem_dims[/*rank*/], + const uint64_t gmem_strides[/*rank - 1*/], + const int32_t pixel_box_lower_corner[/*rank*/], + const int32_t pixel_box_upper_corner[/*rank*/], + uint32_t channels_per_pixel, uint32_t pixels_per_col, + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); 
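+  // This object is passed to kernels by value; on the device its address is
+  // used directly as the tensor-map operand of the cp.async.bulk.tensor PTX
+  // instructions (see get_native_descriptor() below).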
+#ifdef __SYCL_DEVICE_ONLY__ + uintptr_t get_native_descriptor() const { + return reinterpret_cast(this); + } +#endif +}; + +} // namespace ext::codeplay::experimental::cuda +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 8a931dde35a71..fe0072d3ea473 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -72,3 +72,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 78) __SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 79) __SYCL_ASPECT(ext_oneapi_atomic16, 80) __SYCL_ASPECT(ext_oneapi_virtual_functions, 81) +__SYCL_ASPECT(ext_codeplay_cuda_tensor_map, 82) diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7bb35a9e158cd..c42fc870d4fa8 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -285,6 +285,7 @@ set(SYCL_COMMON_SOURCES "detail/scheduler/graph_builder.cpp" "detail/spec_constant_impl.cpp" "detail/sycl_mem_obj_t.cpp" + "detail/cuda_tensor_map.cpp" "detail/usm/usm_impl.cpp" "detail/ur.cpp" "detail/util.cpp" diff --git a/sycl/source/detail/cuda_tensor_map.cpp b/sycl/source/detail/cuda_tensor_map.cpp new file mode 100644 index 0000000000000..1ab5e18a6bb52 --- /dev/null +++ b/sycl/source/detail/cuda_tensor_map.cpp @@ -0,0 +1,198 @@ +//==----------------- tensor_map.cpp --- CUDA TMA interop wrappers ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#include "detail/device_impl.hpp" +#include +#include +#include +#include + +namespace { +using tcd = + sycl::ext::codeplay::experimental::cuda::detail::__tensor_copy_descriptor; + +static inline ur_device_handle_t get_ur_device(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getHandleRef(); +} + +static inline sycl::detail::AdapterPtr get_adapter(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getAdapter(); +} +// n.b. 
none of these enum converters have a default switch label so we get +// missing enumeration warnings if new enumerations are added to the underlying +// type +static inline ur_exp_tensor_map_data_type_flags_t +datatype_to_ur(tcd::datatype type) { + switch (type) { + case tcd::datatype::type_uint8: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8; + case tcd::datatype::type_uint16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16; + case tcd::datatype::type_uint32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32; + case tcd::datatype::type_int32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32; + case tcd::datatype::type_uint64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64; + case tcd::datatype::type_int64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64; + case tcd::datatype::type_float16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16; + case tcd::datatype::type_float32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32; + case tcd::datatype::type_float64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64; + case tcd::datatype::type_bfloat16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16; + case tcd::datatype::type_float32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ; + case tcd::datatype::type_tfloat32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32; + case tcd::datatype::type_tfloat32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_interleave_flags_t +interleave_to_ur(tcd::interleave interleave) { + switch (interleave) { + case tcd::interleave::interleave_none: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE; + case tcd::interleave::interleave_16: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B; + case tcd::interleave::interleave_32: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_swizzle_flags_t +swizzle_to_ur(tcd::swizzle swizzle) { + switch (swizzle) { + case tcd::swizzle::swizzle_none: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE; + case tcd::swizzle::swizzle_32: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B; + case tcd::swizzle::swizzle_64: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B; + case tcd::swizzle::swizzle_128: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_l2_promotion_flags_t +l2_promote_to_ur(tcd::l2_promote promote) { + switch (promote) { + case tcd::l2_promote::promote_none: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE; + case tcd::l2_promote::promote_l2_64: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B; + case tcd::l2_promote::promote_l2_128: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B; + case tcd::l2_promote::promote_l2_256: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B; + } + throw sycl::exception(sycl::errc::invalid); +} +static inline ur_exp_tensor_map_oob_fill_flags_t +oob_fill_to_ur(tcd::oob_fill fill) { + switch (fill) { + case tcd::oob_fill::oob_fill_none: + return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE; + case tcd::oob_fill::oob_fill_nan_request_zero_fma: + return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA; + } + throw sycl::exception(sycl::errc::invalid); +} +} // namespace + +namespace sycl { +inline namespace _V1 { +namespace ext::codeplay::experimental::cuda { +tiled_encode_map::tiled_encode_map(queue &q, void *addr, datatype type, + uint32_t rank, + const uint64_t global_dims[/*rank*/], + const uint64_t global_strides[/*rank - 1*/], + 
const uint32_t box_dims[/*rank*/], + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, + l2_promote promote, oob_fill oob_fill) { + // This static assertion looks a bit funny, due to some fun C++ "features". + // We want to ensure that passing this struct around to kernels works as + // expected (LLVM byval for aggregates in __GRID_CONSTANT__ memory). For that + // to work, the tensor map data space must be the first member of the struct. + // We can't use offsetof here because of visibility (only works with public + // visibility (and it's not really legal for non POD types)). + // We also can't compare pointer differences statically e.g. assert(this == + // data) + // Thus the only thing I can think of to make this validation staticallly is + // to assert that the size of the class is the size of its only member, which + // guarantees the offset is zero. + static_assert(sizeof *this == sizeof data, + "the tensor data must be at offset zero for correct " + "kernel parameter passing"); + + if (!q.get_device().has(sycl::aspect::ext_codeplay_cuda_tensor_map)) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "Tensor maps are only supported on CUDA GPUs with SM >= 90"); + } + + auto ur_device_handle = get_ur_device(q); + // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't + // support opaque types because it needs to allocate the base type. + auto *ur_tensor_map = + reinterpret_cast(this->data); + + auto ur_type = datatype_to_ur(type); + auto ur_swizzle = swizzle_to_ur(swizzle); + auto ur_interleave = interleave_to_ur(interleave); + auto ur_promote = l2_promote_to_ur(promote); + auto ur_fill = oob_fill_to_ur(oob_fill); + + get_adapter(q) + ->call( + ur_device_handle, ur_type, rank, addr, global_dims, global_strides, + box_dims, element_strides, ur_interleave, ur_swizzle, ur_promote, + ur_fill, &ur_tensor_map); +} + +im2col_encode_map::im2col_encode_map( + queue &q, datatype type, uint32_t rank, void *addr, + const uint64_t gmem_dims[/*rank*/], + const uint64_t gmem_strides[/*rank - 1*/], + const int32_t pixel_box_lower_corner[/*rank*/], + const int32_t pixel_box_upper_corner[/*rank*/], uint32_t channels_per_pixel, + uint32_t pixels_per_col, const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill) { + auto ur_device_handle = get_ur_device(q); + // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't + // support opaque types because it needs to allocate the base type. 
+ auto *ur_tensor_map = + reinterpret_cast(this->data); + + auto ur_type = datatype_to_ur(type); + auto ur_swizzle = swizzle_to_ur(swizzle); + auto ur_interleave = interleave_to_ur(interleave); + auto ur_promote = l2_promote_to_ur(promote); + auto ur_fill = oob_fill_to_ur(oob_fill); + get_adapter(q) + ->call( + ur_device_handle, ur_type, rank, addr, gmem_dims, gmem_strides, + pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, + pixels_per_col, element_strides, ur_interleave, ur_swizzle, + ur_promote, ur_fill, &ur_tensor_map); +} +} // namespace ext::codeplay::experimental::cuda +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 205f5d14eada2..568cd02ae6a43 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -473,6 +473,21 @@ bool device_impl::has(aspect Aspect) const { return get_info(); case aspect::ext_oneapi_native_assert: return isAssertFailSupported(); + case aspect::ext_codeplay_cuda_tensor_map: { + using arch = sycl::ext::oneapi::experimental::architecture; + const arch supported_archs[] = { + arch::nvidia_gpu_sm_90, + arch::nvidia_gpu_sm_90a, + }; + try { + return std::any_of( + std::begin(supported_archs), std::end(supported_archs), + [this](const arch a) { return this->extOneapiArchitectureIs(a); }); + } catch (const sycl::exception &) { + return false; + } + return false; + } case aspect::ext_oneapi_cuda_async_barrier: { int async_barrier_supported; bool call_successful = diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp index ea1bbec27762d..3c0b5ba89b8ac 100644 --- a/sycl/test-e2e/Basic/aspects.cpp +++ b/sycl/test-e2e/Basic/aspects.cpp @@ -90,6 +90,9 @@ int main() { if (plt.has(aspect::ext_oneapi_virtual_functions)) { std::cout << " ext_oneapi_virtual_functions" << std::endl; } + if (plt.has(aspect::ext_codeplay_cuda_tensor_map)) { + std::cout << " ext_codeplay_cuda_tensor_map" << std::endl; + } } std::cout << "Passed." 
<< std::endl; return 0; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 8d27788b92758..72104afd2b893 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4028,3 +4028,7 @@ _ZNK4sycl3_V19kernel_id8get_nameEv _ZNKSt4hashIN4sycl3_V15queueEEclERKS2_ __sycl_register_lib __sycl_unregister_lib +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC2ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC1ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC2ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC1ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e8d3186745074..730d686d495b0 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -7,6 +7,12 @@ # REQUIRES: windows # UNSUPPORTED: libcxx +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??0im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@W4datatype@__tensor_copy_descriptor@detail@123456@IPEAXQEB_K3QEBH4IIQEBIW4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??0tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@PEAXW4datatype@__tensor_copy_descriptor@detail@123456@IQEB_K3QEBI4W4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$00@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$0A@@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$create_sub_devices@$0BAIG@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@_K@Z diff --git 
a/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp new file mode 100644 index 0000000000000..5bb47825c3ec8 --- /dev/null +++ b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp @@ -0,0 +1,9 @@ +// RUN: %clangxx -fsycl -fsyntax-only %s +#include + +#ifndef SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP +#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP is not defined +#endif +#if SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP != 1 +#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP has unexpected value +#endif