intel · v-klochkov · Jan 2, 2024 · Dec 25, 2023 · Dec 25, 2023 · Dec 26, 2023
@@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
   }
 }
 
+template <typename T> constexpr uint32_t get_lsc_data_size() {
+  switch (sizeof(T)) {
+  case 1:
+    return 0;
+  case 2:
+    return 1;
+  case 4:
+    return 2;
+  case 8:
+    return 3;
+  default:
+    static_assert(true, "Unsupported data type.");
+  }
+}
+
 template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
 constexpr uint32_t get_lsc_load_cache_mask() {
   if constexpr (L1H == cache_hint::read_invalidate &&
@@ -1992,16 +2007,17 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
 __ESIMD_API __ESIMD_NS::simd<T, N>
 lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
             unsigned SurfacePitch, int X, int Y) {
+  using RawT = __ESIMD_DNS::__raw_t<T>;
   detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
-  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
-                                          Transposed, Transformed,
+  detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight,
+                                          NBlocks, Transposed, Transformed,
                                           detail::block_2d_op::load>();
   // For Load BlockWidth is padded up to the next power-of-two value.
   // For Load with Transpose the pre-operation BlockHeight is padded up
   // to the next power-of-two value.
   // For Load with Transform pre-operation BlockHeight is padded up to
   // multiple of K, where K = 4B / sizeof(T).
-  constexpr int ElemsPerDword = 4 / sizeof(T);
+  constexpr int ElemsPerDword = 4 / sizeof(RawT);
   constexpr int GRFRowSize = Transposed    ? BlockHeight
                              : Transformed ? BlockWidth * ElemsPerDword
                                            : BlockWidth;
@@ -2013,7 +2029,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
                          : BlockHeight);
   constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
   constexpr int GRFBlockPitch =
-      detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
+      detail::roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
   constexpr int ActualN = NBlocks * GRFBlockPitch;
 
   constexpr int DstBlockElements = GRFColSize * GRFRowSize;
@@ -2022,14 +2038,14 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
   static_assert(N == ActualN || N == DstElements, "Incorrect element count");
 
   constexpr lsc_data_size DS =
-      detail::finalize_data_size<T, lsc_data_size::default_size>();
+      detail::finalize_data_size<RawT, lsc_data_size::default_size>();
   __ESIMD_NS::simd_mask<ActualN> pred = 1;
   uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
   constexpr detail::lsc_data_order _Transposed =
       Transposed ? detail::lsc_data_order::transpose
                  : detail::lsc_data_order::nontranspose;
-  __ESIMD_NS::simd<T, ActualN> Raw =
-      __esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
+  __ESIMD_NS::simd<RawT, ActualN> Raw =
+      __esimd_lsc_load2d_stateless<RawT, L1H, L3H, DS, _Transposed, NBlocks,
                                    BlockWidth, BlockHeight, Transformed,
                                    ActualN>(pred.data(), surf_addr,
                                             SurfaceWidth, SurfaceHeight,
@@ -2055,16 +2071,17 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
     // +----+----+----+----+----+----+-----+-----+
     // * signifies the padded element.
 
-    __ESIMD_NS::simd<T, DstElements> Dst;
+    __ESIMD_NS::simd<RawT, DstElements> Dst;
 
     for (auto i = 0; i < NBlocks; i++) {
       auto DstBlock =
           Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
 
       auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
-      DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
-                     .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
-                     .template bit_cast_view<T>();
+      DstBlock =
+          RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
+              .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
+              .template bit_cast_view<RawT>();
     }
 
     return Dst;
@@ -2146,30 +2163,32 @@ template <typename T, int BlockWidth, int BlockHeight = 1,
 __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth,
                               unsigned SurfaceHeight, unsigned SurfacePitch,
                               int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
+  using RawT = __ESIMD_DNS::__raw_t<T>;
   detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
-  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
-                                          false, detail::block_2d_op::store>();
+  detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1,
+                                          false, false,
+                                          detail::block_2d_op::store>();
   constexpr lsc_data_size DS =
-      detail::finalize_data_size<T, lsc_data_size::default_size>();
+      detail::finalize_data_size<RawT, lsc_data_size::default_size>();
   uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::nontranspose;
 
   constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
-  __ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
+  __ESIMD_NS::simd<RawT, BlockHeight * Pitch> Raw;
 
   if constexpr (BlockHeight * Pitch == N) {
     Raw = Vals;
   } else {
     // For store with padding, allocate the block with padding, and place
     // original data there.
-    auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
-    auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
+    auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
+    auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
     Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
   }
 
   __ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
-  __esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
+  __esimd_lsc_store2d_stateless<RawT, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
                                 BlockHeight, false, BlockHeight * Pitch>(
       pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
       Raw.data());
@@ -2428,17 +2447,23 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
   constexpr int DstBlockElements = GRFColSize * GRFRowSize;
   constexpr int DstElements = DstBlockElements * NBlocks;
 
+  constexpr uint32_t DstLength = DstBlockElements * sizeof(T) / 32;
+  constexpr uint32_t DstLengthMask = DstLength == 0    ? 1 << 20
+                                     : DstLength == 32 ? 31 << 20
+                                                       : DstLength << 20;
+
   static_assert(N == ActualN || N == DstElements, "Incorrect element count");
 
   constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
                                   << 17;
-  constexpr uint32_t base_desc = 0x2800403;
+  constexpr uint32_t base_desc = 0x2000003;
   constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
   constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
+  constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
   __ESIMD_NS::simd<T, N> oldDst;
   constexpr uint32_t exDesc = 0x0;
-  constexpr uint32_t desc =
-      base_desc | cache_mask | transformMask | transposeMask;
+  constexpr uint32_t desc = base_desc | cache_mask | transformMask |
+                            transposeMask | dataSizeMask | DstLengthMask;
   constexpr uint8_t execSize = 1;
   constexpr uint8_t sfid = 0xF;
   constexpr uint8_t numSrc0 = 0x1;
@@ -2500,12 +2525,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
                 "Transposed and transformed is not supported");
   constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
                                   << 17;
-  constexpr uint32_t base_desc = 0x2000403;
+  constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
+  constexpr uint32_t base_desc = 0x2000003;
   constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
   constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
   constexpr uint32_t exDesc = 0x0;
   constexpr uint32_t desc =
-      base_desc | cache_mask | transformMask | transposeMask;
+      base_desc | cache_mask | transformMask | transposeMask | dataSizeMask;
   constexpr uint8_t execSize = 1;
   constexpr uint8_t sfid = 0xF;
   constexpr uint8_t numDst = (N * sizeof(T)) / 64;
@@ -2542,10 +2568,11 @@ lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,
 
   constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
                                   << 17;
-  constexpr uint32_t base_desc = 0x2000407;
+  constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
+  constexpr uint32_t base_desc = 0x2000007;
 
   constexpr uint32_t exDesc = 0x0;
-  constexpr uint32_t desc = base_desc | cache_mask;
+  constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask;
   constexpr uint8_t execSize = 1;
   constexpr uint8_t sfid = 0xF;
   constexpr uint8_t numSrc0 = 0x1;

@@ -0,0 +1,81 @@
+//==----- lsc_load_2d_compare.cpp - DPC++ ESIMD on-device test ------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// The tests makes sure old and new load_2d API produce identical
+// results.
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using bf16 = sycl::ext::oneapi::bfloat16;
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+template <typename T> bool test() {
+  sycl::queue Q(sycl::gpu_selector_v);
+  auto dev = Q.get_device();
+  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
+            << "\n";
+
+  constexpr int TM = 8;
+  constexpr int TN = 8;
+  constexpr int NBLOCKS = 2;
+  constexpr int WIDTH = 2 * TN;
+  constexpr int HEIGHT = TM;
+  constexpr int PITCH = WIDTH;
+  constexpr int SIZE = WIDTH * HEIGHT;
+
+  auto *A = malloc_shared<T>(SIZE, Q);
+  auto *B = malloc_shared<T>(SIZE, Q);
+  auto *C = malloc_shared<T>(SIZE, Q);
+  auto *C1 = malloc_shared<T>(SIZE, Q);
+
+  for (int i = 0; i < SIZE; i++) {
+    A[i] = static_cast<T>(i);
+  }
+
+  Q.parallel_for(sycl::nd_range<1>(1, 1), [=](sycl::nd_item<1>
+                                                  item) SYCL_ESIMD_KERNEL {
+     config_2d_mem_access<T, TN, TM, NBLOCKS> my_config(
+         A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0);
+
+     simd<T, NBLOCKS * TM * TN> tmp =
+         lsc_load_2d<T, TN, TM, NBLOCKS, false, false>(my_config);
+     simd<T, NBLOCKS * TM * TN> tmp1 = lsc_load_2d<T, TN, TM, NBLOCKS>(
+         my_config.get_data_pointer(), my_config.get_surface_width(),
+         my_config.get_surface_height(), my_config.get_surface_pitch(),
+         my_config.get_x(), my_config.get_y());
+
+     tmp.copy_to(C);
+     tmp1.copy_to(C1);
+   }).wait();
+
+  bool error = false;
+  for (auto i = 0; i < SIZE; ++i)
+    error |= C[i] != C1[i];
+
+  free(A, Q);
+  free(C, Q);
+  free(C1, Q);
+  return error;
+}
+
+int main() {
+  bool result = false;
+  result |= test<float>();
+  result |= test<uint32_t>();
+  result |= test<uint16_t>();
+  result |= test<uint8_t>();
+  result |= test<sycl::half>();
+
+  std::cout << (result ? "FAILED" : "passed") << std::endl;
+  return 0;
+}
@@ -20,67 +20,67 @@ using namespace sycl;
 using namespace sycl::ext::intel::esimd;
 using namespace sycl::ext::intel::experimental::esimd;
 
-int main() {
-  constexpr uint32_t BlockWidth = 16;
-  constexpr uint32_t BlockHeight = 4;
+template <typename T> bool test() {
+  constexpr uint32_t BlockWidth = 8;
+  constexpr uint32_t BlockHeight = 8;
   constexpr uint32_t NumBlocks = 1;
-  constexpr uint32_t SurfaceHeight = 16;
-  constexpr uint32_t SurfaceWidth = 16;
-  constexpr uint32_t SurfacePitch = 16;
+  constexpr uint32_t SurfaceHeight = BlockHeight;
+  constexpr uint32_t SurfaceWidth = BlockWidth * NumBlocks;
+  constexpr uint32_t SurfacePitch = BlockWidth;
   constexpr uint32_t x = 0;
   constexpr uint32_t y = 0;
 
-  constexpr uint32_t Size = SurfacePitch * SurfaceHeight * NumBlocks;
+  constexpr uint32_t Size = SurfaceWidth * SurfaceHeight;
 
   queue q;
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
-  auto *A = malloc_shared<float>(Size, q);
-  auto *B = malloc_shared<float>(Size, q);
-  auto *C = malloc_shared<float>(Size, q);
-  auto *C1 = malloc_shared<float>(Size, q);
+  auto *A = malloc_shared<T>(Size, q);
+  auto *B = malloc_shared<T>(Size, q);
+  auto *C = malloc_shared<T>(Size, q);
+  auto *C1 = malloc_shared<T>(Size, q);
 
   for (auto i = 0; i != Size; i++) {
-    A[i] = B[i] = 7;
+    A[i] = B[i] = i;
     C[i] = C1[i] = 0;
   }
 
   auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Test>(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
-      constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1;
+    cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
+      constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1;
       constexpr uint32_t height = SurfaceHeight - 1;
-      constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1;
-      auto data_a = lsc_load_2d<float, BlockWidth, BlockHeight, NumBlocks>(
+      constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1;
+      simd<T, Size> data_a = lsc_load_2d<T, BlockWidth, BlockHeight, NumBlocks>(
           A, width, height, pitch, x, y);
-      auto data_b = lsc_load_2d<float, BlockWidth, BlockHeight, NumBlocks>(
+      simd<T, Size> data_b = lsc_load_2d<T, BlockWidth, BlockHeight, NumBlocks>(
           B, width, height, pitch, x, y);
 
-      auto data_c = data_a + data_b;
+      simd<T, Size> data_c = data_a + data_b;
 
-      lsc_store_2d<float, BlockWidth, BlockHeight>(C, width, height, pitch, x,
-                                                   y, data_c);
+      lsc_store_2d<T, BlockWidth, BlockHeight>(C, width, height, pitch, x, y,
+                                               data_c);
     });
   });
   e.wait();
 
   auto e1 = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Test1>(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
-      constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1;
+    cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
+      constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1;
       constexpr uint32_t height = SurfaceHeight - 1;
-      constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1;
+      constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1;
 
-      config_2d_mem_access<float, BlockWidth, BlockHeight, NumBlocks> payload(
+      config_2d_mem_access<T, BlockWidth, BlockHeight, NumBlocks> payload(
           A, width, height, pitch, 0, 0);
-      lsc_prefetch_2d<float, BlockWidth, BlockHeight, NumBlocks, false, false,
+      lsc_prefetch_2d<T, BlockWidth, BlockHeight, NumBlocks, false, false,
                       cache_hint::cached, cache_hint::cached>(payload);
-      auto data_a = lsc_load_2d(payload);
+      simd<T, Size> data_a = lsc_load_2d(payload);
 
       payload.set_data_pointer(B);
-      lsc_prefetch_2d<float, BlockWidth, BlockHeight, NumBlocks, false, false,
+      lsc_prefetch_2d<T, BlockWidth, BlockHeight, NumBlocks, false, false,
                       cache_hint::cached, cache_hint::cached>(payload);
-      auto data_b = lsc_load_2d(payload);
+      simd<T, Size> data_b = lsc_load_2d(payload);
 
-      auto data_c = data_a + data_b;
+      simd<T, Size> data_c = data_a + data_b;
 
       payload.set_data_pointer(C1);
 
@@ -89,14 +89,27 @@ int main() {
   });
   e1.wait();
 
-  auto error = 0;
+  bool error = false;
   for (auto i = 0; i < Size; ++i)
-    error += std::abs(C[i] - C1[i]);
+    error |= C[i] != C1[i];
 
   free(A, q);
   free(B, q);
   free(C, q);
   free(C1, q);
-  std::cout << (error != 0 ? "FAILED" : "passed") << std::endl;
+  return error;
+}
+
+int main() {
+  bool result = false;
+  result |= test<float>();
+  result |= test<uint32_t>();
+  result |= test<uint16_t>();
+  result |= test<uint64_t>();
+  result |= test<double>();
+  result |= test<uint8_t>();
+  result |= test<sycl::half>();
+
+  std::cout << (result ? "FAILED" : "passed") << std::endl;
   return 0;
 }