[SYCL][ext] Define and Implement sycl_ext_tensor_map

This is a fairly mechanical implementation of the basic infrastructure required to access CUDA TMA descriptors from within SYCL kernels, while initializing them on the host. The new feature exposes two new classes and associated support structure in `sycl::ext::codeplay::experimental::cuda`. There's some ugliness involved to make this work on account of the way NVIDIA implemented this basic feature, but it's all in the name of {legitimate-field-of-endeavour}.
intel · Dec 4, 2024 · ad4d6a8 · ad4d6a8
1 parent 5e0db3e
commit ad4d6a8
Show file tree

Hide file tree

Showing 9 changed files with 725 additions and 3 deletions.
diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
@@ -86,6 +86,8 @@ def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group"
 def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">;
 def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">;
 def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">;
+def AspectExt_codeplay_cuda_tensor_map : Aspect<"ext_codeplay_cuda_tensor_map">;
+
 // Deprecated aspects
 def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
 def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -150,7 +152,9 @@ def : TargetInfo<"__TestAspectList",
     AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, 
     AspectExt_intel_fpga_task_sequence,
     AspectExt_oneapi_atomic16,
-    AspectExt_oneapi_virtual_functions],
+    AspectExt_oneapi_virtual_functions,
+    AspectExt_codeplay_cuda_tensor_map,
+    ],
     []>;
 // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
 // match.
@@ -259,9 +263,21 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles
 def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
     [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
+    [
+      AspectFp16,
+      AspectAtomic64,
+      AspectExt_oneapi_cuda_async_barrier,
+      AspectExt_oneapi_cuda_cluster_group,
+      AspectExt_codeplay_cuda_tensor_map,
+    ])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
+    [
+      AspectFp16,
+      AspectAtomic64,
+      AspectExt_oneapi_cuda_async_barrier,
+      AspectExt_oneapi_cuda_cluster_group,
+      AspectExt_codeplay_cuda_tensor_map,
+    ])>;
 
 //
 // HIP / AMDGPU device aspects