From 1fba3c966e115da411cedf9f56ec6e74f2ea4ae6 Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Tue, 16 Jul 2024 16:26:13 +0100 Subject: [PATCH] [SYCL][ext] Define and Implement sycl_ext_tensor_map This is a fairly mechanical implementation of the basic infrastructure required to access CUDA TMA descriptors from within SYCL kernels, while initializing them on the host. The new feature exposes two new classes and associated support structure in `sycl::ext::codeplay::experimental::cuda`. There's some ugliness involved to make this work on account of the way NVIDIA implemented this basic feature, but it's all in the name of {legitimate-field-of-endeavour}. --- .../llvm/SYCLLowerIR/DeviceConfigFile.td | 22 +- ...sycl_ext_codeplay_cuda_tensor_map.asciidoc | 367 ++++++++++++++++++ .../codeplay/experimental/cuda_tensor_map.hpp | 117 ++++++ sycl/include/sycl/info/aspects.def | 1 + sycl/source/CMakeLists.txt | 1 + sycl/source/detail/cuda_tensor_map.cpp | 198 ++++++++++ sycl/source/detail/device_impl.cpp | 15 + sycl/test-e2e/Basic/aspects.cpp | 3 + sycl/test/abi/sycl_symbols_linux.dump | 4 + sycl/test/abi/sycl_symbols_windows.dump | 6 + .../macros_ext_cuda_tensor_map.cpp | 9 + 11 files changed, 740 insertions(+), 3 deletions(-) create mode 100644 sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc create mode 100644 sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp create mode 100644 sycl/source/detail/cuda_tensor_map.cpp create mode 100644 sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 172ef94b3bd1a..34c7097883271 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -86,6 +86,8 @@ def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group" def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">; def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">; def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">; +def AspectExt_codeplay_cuda_tensor_map : Aspect<"ext_codeplay_cuda_tensor_map">; + // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -150,7 +152,9 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_atomic16, - AspectExt_oneapi_virtual_functions], + AspectExt_oneapi_virtual_functions, + AspectExt_codeplay_cuda_tensor_map, + ], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. 
@@ -265,9 +269,21 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles
 def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
                                          [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-                                         [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
+                                         [
+                                           AspectFp16,
+                                           AspectAtomic64,
+                                           AspectExt_oneapi_cuda_async_barrier,
+                                           AspectExt_oneapi_cuda_cluster_group,
+                                           AspectExt_codeplay_cuda_tensor_map,
+                                         ])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-                                         [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
+                                         [
+                                           AspectFp16,
+                                           AspectAtomic64,
+                                           AspectExt_oneapi_cuda_async_barrier,
+                                           AspectExt_oneapi_cuda_cluster_group,
+                                           AspectExt_codeplay_cuda_tensor_map,
+                                         ])>;
 //
 // HIP / AMDGPU device aspects
diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc
new file mode 100644
index 0000000000000..3f89b734915e3
--- /dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_codeplay_cuda_tensor_map.asciidoc
@@ -0,0 +1,367 @@
= sycl_ext_codeplay_cuda_tensor_map

:source-highlighter: coderay
:coderay-linenums-mode: table

// This section needs to be after the document title.
:doctype: book
:toc2:
:toc: left
:encoding: utf-8
:lang: en
:dpcpp: pass:[DPC++]
:cuda-guide-using-tma: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-tma-to-transfer-multi-dimensional-arrays
:cuda-guide-async-copies: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#asynchronous-data-copies-using-tensor-memory-access-tma

// Set the default source code type in this document to C++,
// for syntax highlighting purposes. This is needed because
// docbook uses c++ and html5 uses cpp.
:language: {basebackend@docbook:c++:cpp}

== Notice

[%hardbreaks]
Copyright (C) Codeplay Software Limited. All rights reserved.

Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
permission by Khronos.

== Contact

To report problems with this extension, please open a new issue at:

https://github.com/intel/llvm/issues

== Dependencies

This extension is written against the SYCL 2020 revision 9 specification. All
references below to the "core SYCL specification" or to section numbers in the
SYCL specification refer to that revision.

== Status

This is an experimental extension specification, intended to provide early
access to features and gather community feedback. Interfaces defined in
this specification are implemented in {dpcpp}, but they are not finalized
and may change incompatibly in future versions of {dpcpp} without prior notice.
*Shipping software products should not rely on APIs defined in
this specification.*

[NOTE]
====
This extension is currently implemented in {dpcpp} only for NVIDIA GPU devices
with Compute Capability 9.0 or above and only when using the CUDA backend.
====

== Introduction

This document describes an extension that adds interfaces giving oneAPI
implementers access to CUDA's Tensor Memory Access (TMA) APIs from within SYCL
kernels.
These interfaces provide utilities to enable accelerated copies of
multidimensional arrays of various types. There is no novelty here; only the
plumbing needed to access the accelerated hardware features.

== Specification

=== Feature Test Macro

This extension provides a feature-test macro as described in the core SYCL
specification section 6.3.3 "Feature test macros". Therefore, an
implementation supporting this extension must predefine the macro
`SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP` to one of the values defined in the table
below. Applications can test for the existence of this macro to determine if
the implementation supports this feature, or applications can test the macro's
value to determine which of the extension's APIs the implementation supports.

[%header,cols="1,5"]
|===
|Value
|Description

|1
|Initial version of this extension
|===

=== Overview

Certain libraries shipped with oneAPI may need access to proprietary
accelerator extensions to achieve good performance via driver-provided
features that are beyond the reach of the compiler or language model. One such
extension is CUDA's link:{cuda-guide-async-copies}[Tensor Memory Access]
interface, which enables accelerated copies of multidimensional arrays. This
is *not* a feature that can be implemented in the compiler directly, because
the parameters for the tensor copy adhere to an unusual ABI and the context
type used for initializing such operations requires a call to the driver from
the host. The initialized data can then *only* be acted upon within the device
context. Thus, gaining access to accelerated "tensor" copies within SYCL
requires interfaces that emulate this pattern in such a way that the user data
can be accessed from within kernels.

[NOTE]
====
These interfaces are for initializing asynchronous *multidimensional* array
copies only. One-dimensional array copies should preferably be performed using
the standard SYCL memcpy facilities.
====

==== Interface

`sycl_ext_codeplay_cuda_tensor_map` defines two classes and a number of
enumerations for initializing the CUDA TMA context objects.

Classes:

- `tiled_encode_map`
- `im2col_encode_map`

Enumerations:

- `datatype`
- `interleave`
- `swizzle`
- `l2_promote`
- `oob_fill`

They are analogous to their CUDA namesakes. The size, alignment and layout of
the structs are unspecified.

Each class has a single constructor whose parameters control the tensor copy
operation implied by the class name.

The `tiled_encode_map` class is used to initialize CUDA state used for tiled
copies of multidimensional arrays. Its arguments are analogous to those of its
CUDA namesake (link:{cuda-guide-using-tma}[cf. the CUDA programming guide]):

 tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
                  const uint64_t global_dims[/*rank*/],
                  const uint64_t global_strides[/*rank - 1*/],
                  const uint32_t box_dims[/*rank*/],
                  const uint32_t element_strides[/*rank*/],
                  interleave interleave, swizzle swizzle, l2_promote promote,
                  oob_fill oob_fill);

The `im2col_encode_map` class is used to initialize CUDA state used for
asynchronous copies with a re-encoding of blocks to columns. Its arguments are
analogous to those of its CUDA namesake
(link:{cuda-guide-using-tma}[cf. the CUDA programming guide]).

When passed to a kernel by value, either of these class objects can then have
its address taken and passed as the second operand of the
`cp.async.bulk.tensor` family of PTX instructions via inline assembly.
+No other operation is supported. + +These objects can be constructed in host code only. It is undefined to attempt +to construct them in a kernel. Using the address of copied objects for the +CUDA tensor operations is undefined. + +==== Sample Header for host code only + +[source, c++] +---- +namespace sycl::ext::codeplay::experimental::cuda { + enum datatype : int { + type_uint8, + type_uint16, + type_uint32, + type_int32, + type_uint64, + type_int64, + type_float16, + type_float32, + type_float64, + type_bfloat16, + type_float32_ftz, + type_tfloat32, + type_tfloat32_ftz, + }; + enum interleave : int { + interleave_none, + interleave_16, + interleave_32, + }; + enum swizzle : int { + swizzle_none, + swizzle_32, + swizzle_64, + swizzle_128, + }; + enum l2_promote : int { + promote_none, + promote_l2_64, + promote_l2_128, + promote_l2_256, + }; + enum oob_fill : int { + oob_fill_none, + oob_fill_nan_request_zero_fma, + }; +struct tiled_encode_map { + tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank, + const uint64_t global_dims[/*rank*/], + const uint64_t global_strides[/*rank - 1*/], + const uint32_t box_dims[/*rank*/], + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); +private: +// Implementation defined members must be private +}; + +struct im2col_encode_map { + enum datatype : int { + type_uint8, + type_uint16, + type_uint32, + type_int32, + type_uint64, + type_int64, + type_float16, + type_float32, + type_float64, + type_bfloat16, + type_float32_ftz, + type_tfloat32, + type_tfloat32_ftz, + }; + enum interleave : int { + interleave_none, + interleave_16, + interleave_32, + }; + enum swizzle : int { + swizzle_none, + swizzle_32, + swizzle_64, + swizzle_128, + }; + enum l2_promote : int { + promote_none, + promote_l2_64, + promote_l2_128, + promote_l2_256, + }; + enum oob_fill : int { + oob_fill_none, + oob_fill_nan_request_zero_fma, + }; +// Implementation defined members must be private + im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr, + const uint64_t gmem_dims[/*rank*/], + const uint64_t gmem_strides[/*rank - 1*/], + const int32_t pixel_box_lower_corner[/*rank*/], + const int32_t pixel_box_upper_corner[/*rank*/], + uint32_t channels_per_pixel, uint32_t pixels_per_col, + const uint32_t element_strides[/*rank*/], + interleave interleave, swizzle swizzle, l2_promote promote, + oob_fill oob_fill); +}; +} +---- + +==== Sample Header for device code only + +[source, c++] +---- +namespace sycl::ext::codeplay::experimental::cuda { +class tiled_encode_map { +public: + // Get access to the TMA descriptor for use as an operand to the + // cp.async.bulk.tensor family of PTX instructions + uintptr_t get_native_descriptor(); +}; +class im2col_encode_map { + // Get access to the TMA descriptor for use as an operand to the + // cp.async.bulk.tensor family of PTX instructions + uintptr_t get_native_descriptor(); +}; +} +---- + +== Examples + +[source, c++] +---- +#include +#include + +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::codeplay::experimental::cuda; +namespace sycl_ext = sycl::ext::oneapi::experimental; +#define rank 2 + +#define WIDTH (256) +#define HEIGHT (8) +int main() { + device cuda_dev{ + [](const sycl::device &dev) { + return dev.get_backend() == sycl::backend::ext_oneapi_cuda ? 
1 : -1;
      }};
  bool has_aspect = cuda_dev.has(aspect::ext_codeplay_cuda_tensor_map);
  assert(has_aspect);
  queue q{cuda_dev};
  auto *mem = sycl::malloc_device<int32_t>(WIDTH * HEIGHT, q);

  uint64_t global_dims[rank] = {WIDTH, HEIGHT};
  uint64_t global_strides[rank - 1] = {WIDTH};
  uint32_t box_dims[rank] = {WIDTH / 2, HEIGHT / 2};
  uint32_t element_strides[rank] = {1, 1};

  tiled_encode_map tile(
      q,
      static_cast<void *>(mem),
      tiled_encode_map::datatype::type_int32,
      rank,
      global_dims,
      global_strides,
      box_dims,
      element_strides,
      tiled_encode_map::interleave::interleave_none,
      tiled_encode_map::swizzle::swizzle_none,
      tiled_encode_map::l2_promote::promote_none,
      tiled_encode_map::oob_fill::oob_fill_none
  );

  q.submit([&](handler &cgh) {
    sycl_ext::work_group_scratch_size static_size{WIDTH * HEIGHT * sizeof(int32_t)};
    sycl_ext::properties properties{static_size};
    cgh.parallel_for(nd_range<1>(range<1>(WIDTH * HEIGHT), range<1>(WIDTH)),
                     properties, [=](nd_item<1> Item) {
      sycl_ext::work_group_static<uint64_t> barrier_mem;
      // Convert the decorated local-space pointers to the 32-bit shared memory
      // addresses expected by the "r" asm constraints below.
      auto smem_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          sycl_ext::static_address_cast<
              sycl::access::address_space::local_space>(
              sycl_ext::get_work_group_scratch_memory())
              .get_decorated()));
      auto bar_ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          sycl_ext::static_address_cast<
              sycl::access::address_space::local_space>(&barrier_mem)
              .get_decorated()));
      (void)tile;
      (void)smem_ptr;
      (void)bar_ptr;
#ifdef __SYCL_DEVICE_ONLY__
      uint32_t smem_int_bar = 0;
      int32_t tc0 = 0;
      int32_t tc1 = 0;
      asm volatile(
          "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
          " [%[smem_int_ptr]], [%[tma_descriptor], {%[tc0], %[tc1]}], [%[bar_ptr]];"
          :
          : [smem_int_ptr] "r"(smem_ptr),
            [tma_descriptor] "l"(tile.get_native_descriptor()),
            [bar_ptr] "r"(bar_ptr),
            [tc0] "r"(tc0),
            [tc1] "r"(tc1)
          : "memory");
#endif
    });
    // Do stuff with shared memory now...
  }).wait();
}
----
diff --git a/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp
new file mode 100644
index 0000000000000..89b477e336c53
--- /dev/null
+++ b/sycl/include/sycl/ext/codeplay/experimental/cuda_tensor_map.hpp
@@ -0,0 +1,117 @@
//==----------------- tensor_map.hpp --- CUDA TMA interop wrappers ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/detail/export.hpp>

#include <cstdint>

#define SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP 1

namespace sycl {
inline namespace _V1 {
class queue;
namespace ext::codeplay::experimental::cuda {
namespace detail {
/// An opaque type passed to the runtime used to describe the properties of a
/// tensor copy.

struct alignas(64) __tensor_copy_descriptor {
protected:
  unsigned char data[128];

public:
  // It'd be nice to shorten these enumeration names a little, but since many
  // of them start with numbers, that'd be an illegal name, and nobody is going
  // to prefer typing `tensor_copy_descriptor::interleave::sixteen` over
  // `tensor_copy_descriptor::interleave_16`. Additionally, naming the type
  // enumerations after the type they represent is sketchy since there are so
  // many variations of uint32 et al in the wild.
  // Thus, in the name of consistency, all enumerators here duplicatively
  // encode the type in their names.
  enum datatype : int {
    type_uint8,
    type_uint16,
    type_uint32,
    type_int32,
    type_uint64,
    type_int64,
    type_float16,
    type_float32,
    type_float64,
    type_bfloat16,
    type_float32_ftz,
    type_tfloat32,
    type_tfloat32_ftz,
  };
  enum interleave : int {
    interleave_none,
    interleave_16,
    interleave_32,
  };
  enum swizzle : int {
    swizzle_none,
    swizzle_32,
    swizzle_64,
    swizzle_128,
  };
  enum l2_promote : int {
    promote_none,
    promote_l2_64,
    promote_l2_128,
    promote_l2_256,
  };
  enum oob_fill : int {
    oob_fill_none,
    oob_fill_nan_request_zero_fma,
  };
};
} // namespace detail

struct __SYCL_EXPORT tiled_encode_map final
    : public detail::__tensor_copy_descriptor {
  tiled_encode_map() = delete;
  // Can't be constructed on device, only passed into kernels from the host
  tiled_encode_map(queue &q, void *addr, datatype type, uint32_t rank,
                   const uint64_t global_dims[/*rank*/],
                   const uint64_t global_strides[/*rank - 1*/],
                   const uint32_t box_dims[/*rank*/],
                   const uint32_t element_strides[/*rank*/],
                   interleave interleave, swizzle swizzle, l2_promote promote,
                   oob_fill oob_fill);
#ifdef __SYCL_DEVICE_ONLY__
  uintptr_t get_native_descriptor() const {
    return reinterpret_cast<uintptr_t>(this);
  }
#endif
};

struct __SYCL_EXPORT im2col_encode_map final
    : public detail::__tensor_copy_descriptor {
  im2col_encode_map() = delete;
  // Can't be constructed on device, only passed into kernels from the host
  im2col_encode_map(queue &q, datatype type, uint32_t rank, void *addr,
                    const uint64_t gmem_dims[/*rank*/],
                    const uint64_t gmem_strides[/*rank - 1*/],
                    const int32_t pixel_box_lower_corner[/*rank*/],
                    const int32_t pixel_box_upper_corner[/*rank*/],
                    uint32_t channels_per_pixel, uint32_t pixels_per_col,
                    const uint32_t element_strides[/*rank*/],
                    interleave interleave, swizzle swizzle, l2_promote promote,
                    oob_fill oob_fill);
#ifdef __SYCL_DEVICE_ONLY__
  uintptr_t get_native_descriptor() const {
    return reinterpret_cast<uintptr_t>(this);
  }
#endif
};

} // namespace ext::codeplay::experimental::cuda
} // namespace _V1
} // namespace sycl
diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def
index 8a931dde35a71..fe0072d3ea473 100644
--- a/sycl/include/sycl/info/aspects.def
+++ b/sycl/include/sycl/info/aspects.def
@@ -72,3 +72,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 78)
 __SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 79)
 __SYCL_ASPECT(ext_oneapi_atomic16, 80)
 __SYCL_ASPECT(ext_oneapi_virtual_functions, 81)
+__SYCL_ASPECT(ext_codeplay_cuda_tensor_map, 82)
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
index 7bb35a9e158cd..c42fc870d4fa8 100644
--- a/sycl/source/CMakeLists.txt
+++ b/sycl/source/CMakeLists.txt
@@ -285,6 +285,7 @@ set(SYCL_COMMON_SOURCES
     "detail/scheduler/graph_builder.cpp"
     "detail/spec_constant_impl.cpp"
     "detail/sycl_mem_obj_t.cpp"
+    "detail/cuda_tensor_map.cpp"
     "detail/usm/usm_impl.cpp"
     "detail/ur.cpp"
     "detail/util.cpp"
diff --git a/sycl/source/detail/cuda_tensor_map.cpp b/sycl/source/detail/cuda_tensor_map.cpp
new file mode 100644
index 0000000000000..1ab5e18a6bb52
--- /dev/null
+++ b/sycl/source/detail/cuda_tensor_map.cpp
@@ -0,0 +1,198 @@
//==----------------- tensor_map.cpp --- CUDA TMA interop wrappers ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#include "detail/device_impl.hpp" +#include +#include +#include +#include + +namespace { +using tcd = + sycl::ext::codeplay::experimental::cuda::detail::__tensor_copy_descriptor; + +static inline ur_device_handle_t get_ur_device(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getHandleRef(); +} + +static inline sycl::detail::AdapterPtr get_adapter(sycl::queue &q) { + return sycl::detail::getSyclObjImpl(q.get_device())->getAdapter(); +} +// n.b. none of these enum converters have a default switch label so we get +// missing enumeration warnings if new enumerations are added to the underlying +// type +static inline ur_exp_tensor_map_data_type_flags_t +datatype_to_ur(tcd::datatype type) { + switch (type) { + case tcd::datatype::type_uint8: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8; + case tcd::datatype::type_uint16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16; + case tcd::datatype::type_uint32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32; + case tcd::datatype::type_int32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32; + case tcd::datatype::type_uint64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64; + case tcd::datatype::type_int64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64; + case tcd::datatype::type_float16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16; + case tcd::datatype::type_float32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32; + case tcd::datatype::type_float64: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64; + case tcd::datatype::type_bfloat16: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16; + case tcd::datatype::type_float32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ; + case tcd::datatype::type_tfloat32: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32; + case tcd::datatype::type_tfloat32_ftz: + return UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_interleave_flags_t +interleave_to_ur(tcd::interleave interleave) { + switch (interleave) { + case tcd::interleave::interleave_none: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE; + case tcd::interleave::interleave_16: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B; + case tcd::interleave::interleave_32: + return UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_swizzle_flags_t +swizzle_to_ur(tcd::swizzle swizzle) { + switch (swizzle) { + case tcd::swizzle::swizzle_none: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE; + case tcd::swizzle::swizzle_32: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B; + case tcd::swizzle::swizzle_64: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B; + case tcd::swizzle::swizzle_128: + return UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B; + } + throw sycl::exception(sycl::errc::invalid); +} + +static inline ur_exp_tensor_map_l2_promotion_flags_t +l2_promote_to_ur(tcd::l2_promote promote) { + switch (promote) { + case tcd::l2_promote::promote_none: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE; + case tcd::l2_promote::promote_l2_64: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B; + case tcd::l2_promote::promote_l2_128: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B; + case tcd::l2_promote::promote_l2_256: + return UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B; + } + 
  throw sycl::exception(sycl::errc::invalid);
}

static inline ur_exp_tensor_map_oob_fill_flags_t
oob_fill_to_ur(tcd::oob_fill fill) {
  switch (fill) {
  case tcd::oob_fill::oob_fill_none:
    return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE;
  case tcd::oob_fill::oob_fill_nan_request_zero_fma:
    return UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA;
  }
  throw sycl::exception(sycl::errc::invalid);
}
} // namespace

namespace sycl {
inline namespace _V1 {
namespace ext::codeplay::experimental::cuda {
tiled_encode_map::tiled_encode_map(queue &q, void *addr, datatype type,
                                   uint32_t rank,
                                   const uint64_t global_dims[/*rank*/],
                                   const uint64_t global_strides[/*rank - 1*/],
                                   const uint32_t box_dims[/*rank*/],
                                   const uint32_t element_strides[/*rank*/],
                                   interleave interleave, swizzle swizzle,
                                   l2_promote promote, oob_fill oob_fill) {
  // This static assertion looks a bit funny, due to some fun C++ "features".
  // We want to ensure that passing this struct around to kernels works as
  // expected (LLVM byval for aggregates in __GRID_CONSTANT__ memory). For that
  // to work, the tensor map data space must be the first member of the struct.
  // We can't use offsetof here because of visibility (it only works with
  // public visibility, and it's not really legal for non-POD types). We also
  // can't compare pointer differences statically, e.g. assert(this == data).
  // Thus the only thing I can think of to validate this statically is to
  // assert that the size of the class is the size of its only member, which
  // guarantees the offset is zero.
  static_assert(sizeof *this == sizeof data,
                "the tensor data must be at offset zero for correct "
                "kernel parameter passing");

  if (!q.get_device().has(sycl::aspect::ext_codeplay_cuda_tensor_map)) {
    throw sycl::exception(
        make_error_code(errc::feature_not_supported),
        "Tensor maps are only supported on CUDA GPUs with SM >= 90");
  }

  auto ur_device_handle = get_ur_device(q);
  // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't
  // support opaque types because it needs to allocate the base type.
  auto *ur_tensor_map =
      reinterpret_cast<ur_exp_tensor_map_handle_t>(this->data);

  auto ur_type = datatype_to_ur(type);
  auto ur_swizzle = swizzle_to_ur(swizzle);
  auto ur_interleave = interleave_to_ur(interleave);
  auto ur_promote = l2_promote_to_ur(promote);
  auto ur_fill = oob_fill_to_ur(oob_fill);

  get_adapter(q)
      ->call<sycl::detail::UrApiKind::urTensorMapEncodeTiledExp>(
          ur_device_handle, ur_type, rank, addr, global_dims, global_strides,
          box_dims, element_strides, ur_interleave, ur_swizzle, ur_promote,
          ur_fill, &ur_tensor_map);
}

im2col_encode_map::im2col_encode_map(
    queue &q, datatype type, uint32_t rank, void *addr,
    const uint64_t gmem_dims[/*rank*/],
    const uint64_t gmem_strides[/*rank - 1*/],
    const int32_t pixel_box_lower_corner[/*rank*/],
    const int32_t pixel_box_upper_corner[/*rank*/], uint32_t channels_per_pixel,
    uint32_t pixels_per_col, const uint32_t element_strides[/*rank*/],
    interleave interleave, swizzle swizzle, l2_promote promote,
    oob_fill oob_fill) {
  auto ur_device_handle = get_ur_device(q);
  // XXX This pointer-to-pointer is gross, but the DDI layer generation doesn't
  // support opaque types because it needs to allocate the base type.
  auto *ur_tensor_map =
      reinterpret_cast<ur_exp_tensor_map_handle_t>(this->data);

  auto ur_type = datatype_to_ur(type);
  auto ur_swizzle = swizzle_to_ur(swizzle);
  auto ur_interleave = interleave_to_ur(interleave);
  auto ur_promote = l2_promote_to_ur(promote);
  auto ur_fill = oob_fill_to_ur(oob_fill);
  get_adapter(q)
      ->call<sycl::detail::UrApiKind::urTensorMapEncodeIm2ColExp>(
          ur_device_handle, ur_type, rank, addr, gmem_dims, gmem_strides,
          pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel,
          pixels_per_col, element_strides, ur_interleave, ur_swizzle,
          ur_promote, ur_fill, &ur_tensor_map);
}
} // namespace ext::codeplay::experimental::cuda
} // namespace _V1
} // namespace sycl
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index 205f5d14eada2..568cd02ae6a43 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -473,6 +473,21 @@ bool device_impl::has(aspect Aspect) const {
     return get_info();
   case aspect::ext_oneapi_native_assert:
     return isAssertFailSupported();
+  case aspect::ext_codeplay_cuda_tensor_map: {
+    using arch = sycl::ext::oneapi::experimental::architecture;
+    const arch supported_archs[] = {
+        arch::nvidia_gpu_sm_90,
+        arch::nvidia_gpu_sm_90a,
+    };
+    try {
+      return std::any_of(
+          std::begin(supported_archs), std::end(supported_archs),
+          [this](const arch a) { return this->extOneapiArchitectureIs(a); });
+    } catch (const sycl::exception &) {
+      return false;
+    }
+    return false;
+  }
   case aspect::ext_oneapi_cuda_async_barrier: {
     int async_barrier_supported;
     bool call_successful =
diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp
index ea1bbec27762d..3c0b5ba89b8ac 100644
--- a/sycl/test-e2e/Basic/aspects.cpp
+++ b/sycl/test-e2e/Basic/aspects.cpp
@@ -90,6 +90,9 @@ int main() {
   if (plt.has(aspect::ext_oneapi_virtual_functions)) {
     std::cout << " ext_oneapi_virtual_functions" << std::endl;
   }
+  if (plt.has(aspect::ext_codeplay_cuda_tensor_map)) {
+    std::cout << " ext_codeplay_cuda_tensor_map" << std::endl;
+  }
 }
 std::cout << "Passed."
<< std::endl; return 0; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 8d27788b92758..72104afd2b893 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4028,3 +4028,7 @@ _ZNK4sycl3_V19kernel_id8get_nameEv _ZNKSt4hashIN4sycl3_V15queueEEclERKS2_ __sycl_register_lib __sycl_unregister_lib +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC2ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC1ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda17im2col_encode_mapC2ERNS0_5queueENS4_6detail24__tensor_copy_descriptor8datatypeEjPvPKmSD_PKiSF_jjPKjNS9_10interleaveENS9_7swizzleENS9_10l2_promoteENS9_8oob_fillE +_ZN4sycl3_V13ext8codeplay12experimental4cuda16tiled_encode_mapC1ERNS0_5queueEPvNS4_6detail24__tensor_copy_descriptor8datatypeEjPKmSD_PKjSF_NSA_10interleaveENSA_7swizzleENSA_10l2_promoteENSA_8oob_fillE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e8d3186745074..730d686d495b0 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -7,6 +7,12 @@ # REQUIRES: windows # UNSUPPORTED: libcxx +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??4im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??0im2col_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@W4datatype@__tensor_copy_descriptor@detail@123456@IPEAXQEB_K3QEBH4IIQEBIW4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@AEBU0123456@@Z +??4tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAAAEAU0123456@$$QEAU0123456@@Z +??0tiled_encode_map@cuda@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@56@PEAXW4datatype@__tensor_copy_descriptor@detail@123456@IQEB_K3QEBI4W4interleave@9detail@123456@W4swizzle@9detail@123456@W4l2_promote@9detail@123456@W4oob_fill@9detail@123456@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$00@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$0A@@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$create_sub_devices@$0BAIG@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@_K@Z diff --git 
a/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp
new file mode 100644
index 0000000000000..5bb47825c3ec8
--- /dev/null
+++ b/sycl/test/extensions/cuda_tensor_map/macros_ext_cuda_tensor_map.cpp
@@ -0,0 +1,9 @@
// RUN: %clangxx -fsycl -fsyntax-only %s
#include <sycl/ext/codeplay/experimental/cuda_tensor_map.hpp>

#ifndef SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP
#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP is not defined
#endif
#if SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP != 1
#error SYCL_EXT_CODEPLAY_CUDA_TENSOR_MAP has unexpected value
#endif
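
The extension document's example exercises only `tiled_encode_map`. For reference, below is a minimal host-side sketch of constructing an `im2col_encode_map` from the constructor signature given in the extension document. This is not part of the patch: the helper function name, the tensor extents, byte strides, corner offsets, and the `channels_per_pixel`/`pixels_per_col` values are illustrative assumptions only.

[source, c++]
----
#include <cstdint>

#include <sycl/sycl.hpp>
#include <sycl/ext/codeplay/experimental/cuda_tensor_map.hpp>

using namespace sycl;
using namespace sycl::ext::codeplay::experimental::cuda;

// Hypothetical helper: encode an im2col copy descriptor for an fp16 tensor
// stored in `gmem`, innermost dimension first. All extents, strides and
// corner offsets below are made-up example values.
void encode_im2col_example(queue &q, void *gmem) {
  constexpr uint32_t rank = 4;
  const uint64_t gmem_dims[rank] = {128, 128, 64, 16};
  // Byte strides of the three outer dimensions.
  const uint64_t gmem_strides[rank - 1] = {
      128 * sizeof(uint16_t),
      128 * 128 * sizeof(uint16_t),
      128 * 128 * 64 * sizeof(uint16_t),
  };
  const int32_t pixel_box_lower_corner[rank] = {-1, -1, 0, 0};
  const int32_t pixel_box_upper_corner[rank] = {1, 1, 0, 0};
  const uint32_t element_strides[rank] = {1, 1, 1, 1};

  im2col_encode_map im2col(
      q, im2col_encode_map::datatype::type_float16, rank, gmem,
      gmem_dims, gmem_strides, pixel_box_lower_corner, pixel_box_upper_corner,
      /*channels_per_pixel=*/64, /*pixels_per_col=*/32, element_strides,
      im2col_encode_map::interleave::interleave_none,
      im2col_encode_map::swizzle::swizzle_none,
      im2col_encode_map::l2_promote::promote_none,
      im2col_encode_map::oob_fill::oob_fill_none);

  // The object would subsequently be captured by value in a kernel and its
  // device-side address used with the im2col variants of the
  // cp.async.bulk.tensor PTX instructions, as the tiled example above shows
  // for tiled_encode_map.
  (void)im2col;
}
----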