diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 172ef94b3bd1a..34c7097883271 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -86,6 +86,8 @@ def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group" def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">; def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">; def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">; +def AspectExt_codeplay_cuda_tensor_map : Aspect<"ext_codeplay_cuda_tensor_map">; + // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -150,7 +152,9 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_atomic16, - AspectExt_oneapi_virtual_functions], + AspectExt_oneapi_virtual_functions, + AspectExt_codeplay_cuda_tensor_map, + ], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. @@ -265,9 +269,21 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>; def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; + [ + AspectFp16, + AspectAtomic64, + AspectExt_oneapi_cuda_async_barrier, + AspectExt_oneapi_cuda_cluster_group, + AspectExt_codeplay_cuda_tensor_map, + ])>; def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; + [ + AspectFp16, + AspectAtomic64, + AspectExt_oneapi_cuda_async_barrier, + AspectExt_oneapi_cuda_cluster_group, + AspectExt_codeplay_cuda_tensor_map, + ])>; // // HIP / AMDGPU device aspects diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc new file mode 100644 index 0000000000000..3f89b734915e3 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc @@ -0,0 +1,367 @@ += sycl_ext_codeplay_cuda_tensor_map + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:cuda-guide-using-tma: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-tma-to-transfer-multi-dimensional-arrays +:cuda-guide-async-copies: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#asynchronous-data-copies-using-tensor-memory-access-tma + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + +== Notice + +[%hardbreaks] +Copyright (C) Codeplay Software Limited. All rights reserved. 
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 9 specification. All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+== Status
+
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback. Interfaces defined in
+this specification are implemented in {dpcpp}, but they are not finalized
+and may change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in
+this specification.*
+
+[NOTE]
+====
+This extension is currently implemented in {dpcpp} only for NVIDIA GPU devices
+with Compute Capability 9.0 or above, and only when using the CUDA backend.
+====
+
+== Introduction
+
+This document describes an extension that gives oneAPI implementers access to
+CUDA's Tensor Memory Access (TMA) APIs from within SYCL kernels. These
+interfaces provide utilities that enable accelerated copies of
+multidimensional arrays of various types. There is no novelty here; only the
+plumbing needed to access the accelerated features.
+
+== Specification
+
+=== Feature Test Macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an
+implementation supporting this extension must predefine the macro
+`SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|Initial version of this extension
+|===
+
+=== Overview
+
+Certain libraries shipped with oneAPI need access to proprietary accelerator
+extensions to achieve good performance, using driver-provided features that
+are beyond the reach of the compiler or the language model. One such extension
+is CUDA's link:{cuda-guide-async-copies}[Tensor Memory Access] interface, which
+enables accelerated copies of multidimensional arrays. This is *not* a feature
+that can be implemented in the compiler directly, because the parameters for
+the tensor copy adhere to an unusual ABI and the context type used to
+initialize such operations requires a call to the driver from the host. The
+initialized data can then *only* be acted upon within the device context.
+Thus, gaining access to accelerated "tensor" copies within SYCL requires
+interfaces that expose this pattern in such a way that the user's data can be
+accessed from within kernels.
+
+[NOTE]
+====
+These interfaces are for initializing asynchronous *multidimensional* array
+copies only. One-dimensional array copies should preferably be performed using
+the standard SYCL memcpy facilities.
+====
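+
+As a non-normative sketch (the exact usage pattern is up to the application),
+code can combine the feature-test macro above with the
+`ext_codeplay_cuda_tensor_map` device aspect added by this extension to detect
+support at compile time and at run time:
+
+[source, c++]
+----
+#include <sycl/sycl.hpp>
+
+bool can_use_tensor_maps(const sycl::device &dev) {
+#ifdef SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP
+  // Runtime check: the aspect is only reported for sm_90/sm_90a CUDA devices.
+  return dev.has(sycl::aspect::ext_codeplay_cuda_tensor_map);
+#else
+  return false;
+#endif
+}
+----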
+
+==== Interface
+
+`sycl_ext_codeplay_cuda_tensor_map` defines two classes and a number of
+enumerations used to initialize the CUDA TMA context objects.
+
+Classes:
+
+- `tiled_encode_map`
+- `im2col_encode_map`
+
+Enumerations:
+
+- `datatype`
+- `interleave`
+- `swizzle`
+- `l2_promote`
+- `oob_fill`
+
+They are analogous to their CUDA namesakes. The size, alignment and layout of
+the structs are unspecified.
+
+Each class has a single constructor whose parameters control the tensor copy
+operation implied by the class name.
+
+The `tiled_encode_map` class is used to initialize the CUDA state used for
+tiled copies of multidimensional arrays. Its arguments are analogous to those
+of its CUDA namesake (cf. the link:{cuda-guide-using-tma}[CUDA programming
+guide]):
+
+  tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
+                   const uint64_t global_dims[/*rank*/],
+                   const uint64_t global_strides[/*rank - 1*/],
+                   const uint32_t box_dims[/*rank*/],
+                   const uint32_t element_strides[/*rank*/],
+                   interleave interleave, swizzle swizzle, l2_promote promote,
+                   oob_fill oob_fill);
+
+The `im2col_encode_map` class is used to initialize the CUDA state used for
+asynchronous copies with a re-encoding of blocks to columns. Its arguments are
+analogous to those of its CUDA namesake (cf. the
+link:{cuda-guide-using-tma}[CUDA programming guide]).
+
+When passed to a kernel by value, either of these class objects can have its
+address taken and passed as the second operand of the `cp.async.bulk.tensor`
+family of PTX instructions via inline assembly. No other operation is
+supported.
+
+These objects can be constructed in host code only. Attempting to construct
+them in a kernel is undefined. Using the address of a copied object for the
+CUDA tensor operations is also undefined.
+
+==== Sample Header for host code only
+
+[source, c++]
+----
+namespace sycl::ext::codeplay::experimental::cuda {
+  enum datatype : int {
+    type_uint8,
+    type_uint16,
+    type_uint32,
+    type_int32,
+    type_uint64,
+    type_int64,
+    type_float16,
+    type_float32,
+    type_float64,
+    type_bfloat16,
+    type_float32_ftz,
+    type_tfloat32,
+    type_tfloat32_ftz,
+  };
+  enum interleave : int {
+    interleave_none,
+    interleave_16,
+    interleave_32,
+  };
+  enum swizzle : int {
+    swizzle_none,
+    swizzle_32,
+    swizzle_64,
+    swizzle_128,
+  };
+  enum l2_promote : int {
+    promote_none,
+    promote_l2_64,
+    promote_l2_128,
+    promote_l2_256,
+  };
+  enum oob_fill : int {
+    oob_fill_none,
+    oob_fill_nan_request_zero_fma,
+  };
+
+struct tiled_encode_map {
+  tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
+                   const uint64_t global_dims[/*rank*/],
+                   const uint64_t global_strides[/*rank - 1*/],
+                   const uint32_t box_dims[/*rank*/],
+                   const uint32_t element_strides[/*rank*/],
+                   interleave interleave, swizzle swizzle, l2_promote promote,
+                   oob_fill oob_fill);
+private:
+  // Implementation-defined members must be private
+};
+
+struct im2col_encode_map {
+  im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr,
+                    const uint64_t gmem_dims[/*rank*/],
+                    const uint64_t gmem_strides[/*rank - 1*/],
+                    const int32_t pixel_box_lower_corner[/*rank*/],
+                    const int32_t pixel_box_upper_corner[/*rank*/],
+                    uint32_t channels_per_pixel, uint32_t pixels_per_col,
+                    const uint32_t element_strides[/*rank*/],
+                    interleave interleave, swizzle swizzle, l2_promote promote,
+                    oob_fill oob_fill);
+private:
+  // Implementation-defined members must be private
+};
+}
+----
+
+==== Sample Header for device code only
+
+[source, c++]
+----
+namespace sycl::ext::codeplay::experimental::cuda {
+class tiled_encode_map {
+public:
+  // Get access to the TMA descriptor for use as an operand to the
+  // cp.async.bulk.tensor family of PTX instructions
+  uintptr_t get_native_descriptor();
+};
+class im2col_encode_map {
+public:
+  // Get access to the TMA descriptor for use as an operand to the
+  // cp.async.bulk.tensor family of PTX instructions
+  uintptr_t get_native_descriptor();
+};
+}
+----
+
+== Examples
+
+[source, c++]
+----
+#include <cassert>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+#include <sycl/ext/codeplay/experimental/cuda_tensor_map.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::codeplay::experimental::cuda;
+namespace sycl_ext = sycl::ext::oneapi::experimental;
+
+#define RANK 2
+#define WIDTH (256)
+#define HEIGHT (8)
+
+int main() {
+  device cuda_dev{
+    [](const sycl::device &dev) {
+      return dev.get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1;
+    }
+  };
+  bool has_aspect = cuda_dev.has(aspect::ext_codeplay_cuda_tensor_map);
+  assert(has_aspect);
+  queue q{cuda_dev};
+  auto *mem = sycl::malloc_device<int32_t>(WIDTH * HEIGHT, q);
+
+  uint64_t global_dims[RANK] = {WIDTH, HEIGHT};
+  uint64_t global_strides[RANK - 1] = {WIDTH};
+  uint32_t box_dims[RANK] = {WIDTH / 2, HEIGHT / 2};
+  uint32_t element_strides[RANK] = {1, 1};
+
+  tiled_encode_map tile(
+      q,
+      static_cast<void *>(mem),
+      tiled_encode_map::datatype::type_int32,
+      RANK,
+      global_dims,
+      global_strides,
+      box_dims,
+      element_strides,
+      tiled_encode_map::interleave::interleave_none,
+      tiled_encode_map::swizzle::swizzle_none,
+      tiled_encode_map::l2_promote::promote_none,
+      tiled_encode_map::oob_fill::oob_fill_none);
+
+  q.submit([&](handler &cgh) {
+    sycl_ext::work_group_scratch_size static_size{WIDTH * HEIGHT *
+                                                  sizeof(int32_t)};
+    sycl_ext::properties properties{static_size};
+    cgh.parallel_for(nd_range<1>(range<1>(WIDTH), range<1>(WIDTH)), properties,
+                     [=](nd_item<1> item) {
+      sycl_ext::work_group_static<uint64_t> barrier_mem;
+      auto smem_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
+          sycl_ext::static_address_cast<
+              sycl::access::address_space::local_space>(
+              sycl_ext::get_work_group_scratch_memory())
+              .get_decorated()));
+      auto bar_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
+          sycl_ext::static_address_cast<
+              sycl::access::address_space::local_space>(&barrier_mem)
+              .get_decorated()));
+      (void)tile;
+      (void)smem_ptr;
+      (void)bar_ptr;
+#ifdef __SYCL_DEVICE_ONLY__
+      if (item.get_local_linear_id() == 0) {
+        // Only one work-item issues the bulk tensor copy.
+        int32_t tc0 = 0;
+        int32_t tc1 = 0;
+        asm volatile(
+            "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
+            " [%[smem_int_ptr]], [%[tma_descriptor], {%[tc0], %[tc1]}], [%[bar_ptr]];"
+            :
+            : [smem_int_ptr] "r"(smem_ptr),
+              [tma_descriptor] "l"(tile.get_native_descriptor()),
+              [bar_ptr] "r"(bar_ptr),
+              [tc0] "r"(tc0),
+              [tc1] "r"(tc1)
+            : "memory");
+      }
+#endif
+    });
+    // Do stuff with shared memory now...
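+    // n.b. this example only sketches how the descriptor is consumed; a
+    // complete kernel would also have to initialise the mbarrier referenced
+    // by `bar_ptr` before issuing the copy, and wait on that barrier before
+    // reading the tiles the TMA engine has written into the work-group
+    // scratch memory.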
+ }).wait(); +} +---- diff --git a/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp new file mode 100644 index 0000000000000..89b477e336c53 --- /dev/null +++ b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp @@ -0,0 +1,117 @@ +//==----------------- tensor_map.hpp --- CUDA TMA interop wrappers ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +#define SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP 1 + +namespace sycl { +inline namespace _V1 { +class queue; +namespace ext::codeplay::experimental::cuda { +namespace detail { +/// An opaque type passed to the runtime used to describe the properties of an +/// image. + +struct alignas(64) __tensor_copy_descriptor { +protected: + unsigned char data[128]; + +public: + // It'd be nice to shorten these enumeration names a little, but since many of + // them start with numbers, that'd be an illegal name, and nobody is going to + // prefer typing `tensor_copy_descriptor::interleave::sixteen` over + // `tensor_copy_descriptor::interleave_16`. Additionally, naming the type + // enumerations after the type they represent is sketchy since there are so + // many variations of uint32 et al in the wild. Thus: in the name of + // consistency all enumerators here duplicatively encode the type in their + // names + enum datatype : int { + type_uint8, + type_uint16, + type_uint32, + type_int32, + type_uint64, + type_int64, + type_float16, + type_float32, + type_float64, + type_bfloat16, + type_float32_ftz, + type_tfloat32, + type_tfloat32_ftz, + }; + enum interleave : int { + interleave_none, + interleave_16, + interleave_32, + }; + enum swizzle : int { + swizzle_none, + swizzle_32, + swizzle_64, + swizzle_128, + }; + enum l2_promote : int { + promote_none, + promote_l2_64, + promote_l2_128, + promote_l2_256, + }; + enum oob_fill : int { + oob_fill_none, + oob_fill_nan_request_zero_fma, + }; +}; +} // namespace detail + +struct __SYCL_EXPORT tiled_encode_map final + : public detail::__tensor_copy_descriptor { + tiled_encode_map() = delete; + // Can't be constructed on device, only passed into kernels from the host + tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank, + const uint64_t global_dims[/*rank*/], + const uint64_t global_strides[/*rank - 1*/], + const uint32_t box_dims[/*rank*/], + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); +#ifdef __SYCL_DEVICE_ONLY__ + uintptr_t get_native_descriptor() const { + return reinterpret_cast(this); + } +#endif +}; + +struct __SYCL_EXPORT im2col_encode_map final + : public detail::__tensor_copy_descriptor { + im2col_encode_map() = delete; + // Can't be constructed on device, only passed into kernels from the host + im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr, + const uint64_t gmem_dims[/*rank*/], + const uint64_t gmem_strides[/*rank - 1*/], + const int32_t pixel_box_lower_corner[/*rank*/], + const int32_t pixel_box_upper_corner[/*rank*/], + uint32_t channels_per_pixel, uint32_t pixels_per_col, + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); 
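+  // This object is passed to kernels by value; on the device its address is
+  // used directly as the tensor-map operand of the cp.async.bulk.tensor PTX
+  // instructions (see get_native_descriptor() below).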
+#ifdef __SYCL_DEVICE_ONLY__ + uintptr_t get_native_descriptor() const { + return reinterpret_cast(this); + } +#endif +}; + +} // namespace ext::codeplay::experimental::cuda +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 8a931dde35a71..fe0072d3ea473 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -72,3 +72,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 78) __SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 79) __SYCL_ASPECT(ext_oneapi_atomic16, 80) __SYCL_ASPECT(ext_oneapi_virtual_functions, 81) +__SYCL_ASPECT(ext_codeplay_cuda_tensor_map, 82) diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7bb35a9e158cd..c42fc870d4fa8 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -285,6 +285,7 @@ set(SYCL_COMMON_SOURCES "detail/scheduler/graph_builder.cpp" "detail/spec_constant_impl.cpp" "detail/sycl_mem_obj_t.cpp" + "detail/cuda_tensor_map.cpp" "detail/usm/usm_impl.cpp" "detail/ur.cpp" "detail/util.cpp" diff --git a/sycl/source/detail/cuda_tensor_map.cpp b/sycl/source/detail/cuda_tensor_map.cpp new file mode 100644 index 0000000000000..1ab5e18a6bb52 --- /dev/null +++ b/sycl/source/detail/cuda_tensor_map.cpp @@ -0,0 +1,198 @@ +//==----------------- tensor_map.cpp --- CUDA TMA interop wrappers ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#include "detail/device_impl.hpp" +#include +#include +#include +#include + +namespace { +using tcd = + sycl::ext::codeplay::experimental::cuda::detail::__tensor_copy_descriptor; + +static inline ur_device_handle_t get_ur_device(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getHandleRef(); +} + +static inline sycl::detail::AdapterPtr get_adapter(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getAdapter(); +} +// n.b. 
none of these enum converters have a default switch label so we get +// missing enumeration warnings if new enumerations are added to the underlying +// type +static inline ur_exp_tensor_map_data_type_flags_t +datatype_to_ur(tcd::datatype type) { + switch (type) { + case tcd::datatype::type_uint8: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8; + case tcd::datatype::type_uint16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16; + case tcd::datatype::type_uint32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32; + case tcd::datatype::type_int32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32; + case tcd::datatype::type_uint64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64; + case tcd::datatype::type_int64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64; + case tcd::datatype::type_float16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16; + case tcd::datatype::type_float32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32; + case tcd::datatype::type_float64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64; + case tcd::datatype::type_bfloat16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16; + case tcd::datatype::type_float32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ; + case tcd::datatype::type_tfloat32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32; + case tcd::datatype::type_tfloat32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_interleave_flags_t +interleave_to_ur(tcd::interleave interleave) { + switch (interleave) { + case tcd::interleave::interleave_none: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE; + case tcd::interleave::interleave_16: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B; + case tcd::interleave::interleave_32: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_swizzle_flags_t +swizzle_to_ur(tcd::swizzle swizzle) { + switch (swizzle) { + case tcd::swizzle::swizzle_none: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE; + case tcd::swizzle::swizzle_32: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B; + case tcd::swizzle::swizzle_64: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B; + case tcd::swizzle::swizzle_128: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_l2_promotion_flags_t +l2_promote_to_ur(tcd::l2_promote promote) { + switch (promote) { + case tcd::l2_promote::promote_none: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE; + case tcd::l2_promote::promote_l2_64: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B; + case tcd::l2_promote::promote_l2_128: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B; + case tcd::l2_promote::promote_l2_256: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B; + } + throw sycl::exception(sycl::errc::invalid); +} +static inline ur_exp_tensor_map_oob_fill_flags_t +oob_fill_to_ur(tcd::oob_fill fill) { + switch (fill) { + case tcd::oob_fill::oob_fill_none: + return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE; + case tcd::oob_fill::oob_fill_nan_request_zero_fma: + return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA; + } + throw sycl::exception(sycl::errc::invalid); +} +} // namespace + +namespace sycl { +inline namespace _V1 { +namespace ext::codeplay::experimental::cuda { +tiled_encode_map::tiled_encode_map(queue &q, void *addr, datatype type, + uint32_t rank, + const uint64_t global_dims[/*rank*/], + const uint64_t global_strides[/*rank - 1*/], + 
const uint32_t box_dims[/*rank*/], + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, + l2_promote promote, oob_fill oob_fill) { + // This static assertion looks a bit funny, due to some fun C++ "features". + // We want to ensure that passing this struct around to kernels works as + // expected (LLVM byval for aggregates in __GRID_CONSTANT__ memory). For that + // to work, the tensor map data space must be the first member of the struct. + // We can't use offsetof here because of visibility (only works with public + // visibility (and it's not really legal for non POD types)). + // We also can't compare pointer differences statically e.g. assert(this == + // data) + // Thus the only thing I can think of to make this validation staticallly is + // to assert that the size of the class is the size of its only member, which + // guarantees the offset is zero. + static_assert(sizeof *this == sizeof data, + "the tensor data must be at offset zero for correct " + "kernel parameter passing"); + + if (!q.get_device().has(sycl::aspect::ext_codeplay_cuda_tensor_map)) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "Tensor maps are only supported on CUDA GPUs with SM >= 90"); + } + + auto ur_device_handle = get_ur_device(q); + // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't + // support opaque types because it needs to allocate the base type. + auto *ur_tensor_map = + reinterpret_cast(this->data); + + auto ur_type = datatype_to_ur(type); + auto ur_swizzle = swizzle_to_ur(swizzle); + auto ur_interleave = interleave_to_ur(interleave); + auto ur_promote = l2_promote_to_ur(promote); + auto ur_fill = oob_fill_to_ur(oob_fill); + + get_adapter(q) + ->call( + ur_device_handle, ur_type, rank, addr, global_dims, global_strides, + box_dims, element_strides, ur_interleave, ur_swizzle, ur_promote, + ur_fill, &ur_tensor_map); +} + +im2col_encode_map::im2col_encode_map( + queue &q, datatype type, uint32_t rank, void *addr, + const uint64_t gmem_dims[/*rank*/], + const uint64_t gmem_strides[/*rank - 1*/], + const int32_t pixel_box_lower_corner[/*rank*/], + const int32_t pixel_box_upper_corner[/*rank*/], uint32_t channels_per_pixel, + uint32_t pixels_per_col, const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill) { + auto ur_device_handle = get_ur_device(q); + // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't + // support opaque types because it needs to allocate the base type. 
+ auto *ur_tensor_map = + reinterpret_cast(this->data); + + auto ur_type = datatype_to_ur(type); + auto ur_swizzle = swizzle_to_ur(swizzle); + auto ur_interleave = interleave_to_ur(interleave); + auto ur_promote = l2_promote_to_ur(promote); + auto ur_fill = oob_fill_to_ur(oob_fill); + get_adapter(q) + ->call( + ur_device_handle, ur_type, rank, addr, gmem_dims, gmem_strides, + pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, + pixels_per_col, element_strides, ur_interleave, ur_swizzle, + ur_promote, ur_fill, &ur_tensor_map); +} +} // namespace ext::codeplay::experimental::cuda +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 205f5d14eada2..568cd02ae6a43 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -473,6 +473,21 @@ bool device_impl::has(aspect Aspect) const { return get_info(); case aspect::ext_oneapi_native_assert: return isAssertFailSupported(); + case aspect::ext_codeplay_cuda_tensor_map: { + using arch = sycl::ext::oneapi::experimental::architecture; + const arch supported_archs[] = { + arch::nvidia_gpu_sm_90, + arch::nvidia_gpu_sm_90a, + }; + try { + return std::any_of( + std::begin(supported_archs), std::end(supported_archs), + [this](const arch a) { return this->extOneapiArchitectureIs(a); }); + } catch (const sycl::exception &) { + return false; + } + return false; + } case aspect::ext_oneapi_cuda_async_barrier: { int async_barrier_supported; bool call_successful = diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp index ea1bbec27762d..3c0b5ba89b8ac 100644 --- a/sycl/test-e2e/Basic/aspects.cpp +++ b/sycl/test-e2e/Basic/aspects.cpp @@ -90,6 +90,9 @@ int main() { if (plt.has(aspect::ext_oneapi_virtual_functions)) { std::cout << " ext_oneapi_virtual_functions" << std::endl; } + if (plt.has(aspect::ext_codeplay_cuda_tensor_map)) { + std::cout << " ext_codeplay_cuda_tensor_map" << std::endl; + } } std::cout << "Passed." 
<< std::endl; return 0; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 8d27788b92758..72104afd2b893 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4028,3 +4028,7 @@ _ZNK4sycl3_V19kernel_id8get_nameEv _ZNKSt4hashIN4sycl3_V15queueEEclERKS2_ __sycl_register_lib __sycl_unregister_lib +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC2ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC1ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC2ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC1ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e8d3186745074..730d686d495b0 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -7,6 +7,12 @@ # REQUIRES: windows # UNSUPPORTED: libcxx +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??0im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@W4datatype@__tensor_copy_descriptor@detail@123456@IPEAXQEB_K3QEBH4IIQEBIW4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??0tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@PEAXW4datatype@__tensor_copy_descriptor@detail@123456@IQEB_K3QEBI4W4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$00@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$0A@@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$create_sub_devices@$0BAIG@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@_K@Z diff --git 
a/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp new file mode 100644 index 0000000000000..5bb47825c3ec8 --- /dev/null +++ b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp @@ -0,0 +1,9 @@ +// RUN: %clangxx -fsycl -fsyntax-only %s +#include + +#ifndef SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP +#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP is not defined +#endif +#if SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP != 1 +#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP has unexpected value +#endif