Skip to content

[SYCL][ESIMD] Fix lsc_load_2d API issue that prevented usage for different types #12244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 52 additions & 25 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
}
}

template <typename T> constexpr uint32_t get_lsc_data_size() {
switch (sizeof(T)) {
case 1:
return 0;
case 2:
return 1;
case 4:
return 2;
case 8:
return 3;
default:
static_assert(true, "Unsupported data type.");
}
}

template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
constexpr uint32_t get_lsc_load_cache_mask() {
if constexpr (L1H == cache_hint::read_invalidate &&
Expand Down Expand Up @@ -1992,16 +2007,17 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
unsigned SurfacePitch, int X, int Y) {
using RawT = __ESIMD_DNS::__raw_t<T>;
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
Transposed, Transformed,
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight,
NBlocks, Transposed, Transformed,
detail::block_2d_op::load>();
// For Load BlockWidth is padded up to the next power-of-two value.
// For Load with Transpose the pre-operation BlockHeight is padded up
// to the next power-of-two value.
// For Load with Transform pre-operation BlockHeight is padded up to
// multiple of K, where K = 4B / sizeof(T).
constexpr int ElemsPerDword = 4 / sizeof(T);
constexpr int ElemsPerDword = 4 / sizeof(RawT);
constexpr int GRFRowSize = Transposed ? BlockHeight
: Transformed ? BlockWidth * ElemsPerDword
: BlockWidth;
Expand All @@ -2013,7 +2029,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
: BlockHeight);
constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
constexpr int GRFBlockPitch =
detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
detail::roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
constexpr int ActualN = NBlocks * GRFBlockPitch;

constexpr int DstBlockElements = GRFColSize * GRFRowSize;
Expand All @@ -2022,14 +2038,14 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
static_assert(N == ActualN || N == DstElements, "Incorrect element count");

constexpr lsc_data_size DS =
detail::finalize_data_size<T, lsc_data_size::default_size>();
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
__ESIMD_NS::simd_mask<ActualN> pred = 1;
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
constexpr detail::lsc_data_order _Transposed =
Transposed ? detail::lsc_data_order::transpose
: detail::lsc_data_order::nontranspose;
__ESIMD_NS::simd<T, ActualN> Raw =
__esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
__ESIMD_NS::simd<RawT, ActualN> Raw =
__esimd_lsc_load2d_stateless<RawT, L1H, L3H, DS, _Transposed, NBlocks,
BlockWidth, BlockHeight, Transformed,
ActualN>(pred.data(), surf_addr,
SurfaceWidth, SurfaceHeight,
Expand All @@ -2055,16 +2071,17 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
// +----+----+----+----+----+----+-----+-----+
// * signifies the padded element.

__ESIMD_NS::simd<T, DstElements> Dst;
__ESIMD_NS::simd<RawT, DstElements> Dst;

for (auto i = 0; i < NBlocks; i++) {
auto DstBlock =
Dst.template select<DstBlockElements, 1>(i * DstBlockElements);

auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
.template bit_cast_view<T>();
DstBlock =
RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
.template bit_cast_view<RawT>();
}

return Dst;
Expand Down Expand Up @@ -2146,30 +2163,32 @@ template <typename T, int BlockWidth, int BlockHeight = 1,
__ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth,
unsigned SurfaceHeight, unsigned SurfacePitch,
int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
using RawT = __ESIMD_DNS::__raw_t<T>;
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
false, detail::block_2d_op::store>();
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1,
false, false,
detail::block_2d_op::store>();
constexpr lsc_data_size DS =
detail::finalize_data_size<T, lsc_data_size::default_size>();
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::nontranspose;

constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
__ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
__ESIMD_NS::simd<RawT, BlockHeight * Pitch> Raw;

if constexpr (BlockHeight * Pitch == N) {
Raw = Vals;
} else {
// For store with padding, allocate the block with padding, and place
// original data there.
auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
}

__ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
__esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
__esimd_lsc_store2d_stateless<RawT, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
BlockHeight, false, BlockHeight * Pitch>(
pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
Raw.data());
Expand Down Expand Up @@ -2428,17 +2447,23 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
constexpr int DstElements = DstBlockElements * NBlocks;

constexpr uint32_t DstLength = DstBlockElements * sizeof(T) / 32;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vmustya - can you please take a look at this block of code and the initialization of 'desc' at L2463 ?

Copy link
Contributor

@vmustya vmustya Dec 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the hardware spec, dest size is encoded in units of registers.

constexpr GrfBytes = 64; /// for PVC&Xe2+ 
constexpr auto DstBlockElements = GrfColSize * GrfRowSize;
constexpr auto DstBlockSize = align(DstBlockElements * sizeof(T), GrfBytes); /// each block is register-aligned, there may be cross-block padding present.

constexpr auto DstElements = std::min(31, DstBlockSize / GrfBytes); /// Dst length of 32 is also encoded as 31.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

constexpr uint32_t DstLengthMask = DstLength == 0 ? 1 << 20
: DstLength == 32 ? 31 << 20
: DstLength << 20;

static_assert(N == ActualN || N == DstElements, "Incorrect element count");

constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2800403;
constexpr uint32_t base_desc = 0x2000003;
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
__ESIMD_NS::simd<T, N> oldDst;
constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc =
base_desc | cache_mask | transformMask | transposeMask;
constexpr uint32_t desc = base_desc | cache_mask | transformMask |
transposeMask | dataSizeMask | DstLengthMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numSrc0 = 0x1;
Expand Down Expand Up @@ -2500,12 +2525,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
"Transposed and transformed is not supported");
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2000403;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
constexpr uint32_t base_desc = 0x2000003;
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc =
base_desc | cache_mask | transformMask | transposeMask;
base_desc | cache_mask | transformMask | transposeMask | dataSizeMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numDst = (N * sizeof(T)) / 64;
Expand Down Expand Up @@ -2542,10 +2568,11 @@ lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,

constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2000407;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
constexpr uint32_t base_desc = 0x2000007;

constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc = base_desc | cache_mask;
constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numSrc0 = 0x1;
Expand Down
81 changes: 81 additions & 0 deletions sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
//==----- lsc_load_2d_compare.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu-intel-pvc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are in process of adding DG2 tests. I don't remember if load_2d supported by DG2. If Yes, then lease add one it for DG2. Perhaps this test will simply work for both DG2 and PVC

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried to run it on dg2 and it gets stuck. There is probably another version of load_2d for dg2 or other platforms but this one appears to be for PVC only

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The untyped load/store/prefetch 2d block operations are not supported by DG2/MTL/ARL. They are only available for PVC and Xe2+.

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// The tests makes sure old and new load_2d API produce identical
// results.
#include <iostream>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using bf16 = sycl::ext::oneapi::bfloat16;
using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;
template <typename T> bool test() {
sycl::queue Q(sycl::gpu_selector_v);
auto dev = Q.get_device();
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
<< "\n";

constexpr int TM = 8;
constexpr int TN = 8;
constexpr int NBLOCKS = 2;
constexpr int WIDTH = 2 * TN;
constexpr int HEIGHT = TM;
constexpr int PITCH = WIDTH;
constexpr int SIZE = WIDTH * HEIGHT;

auto *A = malloc_shared<T>(SIZE, Q);
auto *B = malloc_shared<T>(SIZE, Q);
auto *C = malloc_shared<T>(SIZE, Q);
auto *C1 = malloc_shared<T>(SIZE, Q);

for (int i = 0; i < SIZE; i++) {
A[i] = static_cast<T>(i);
}

Q.parallel_for(sycl::nd_range<1>(1, 1), [=](sycl::nd_item<1>
item) SYCL_ESIMD_KERNEL {
config_2d_mem_access<T, TN, TM, NBLOCKS> my_config(
A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0);

simd<T, NBLOCKS * TM * TN> tmp =
lsc_load_2d<T, TN, TM, NBLOCKS, false, false>(my_config);
simd<T, NBLOCKS * TM * TN> tmp1 = lsc_load_2d<T, TN, TM, NBLOCKS>(
my_config.get_data_pointer(), my_config.get_surface_width(),
my_config.get_surface_height(), my_config.get_surface_pitch(),
my_config.get_x(), my_config.get_y());

tmp.copy_to(C);
tmp1.copy_to(C1);
}).wait();

bool error = false;
for (auto i = 0; i < SIZE; ++i)
error |= C[i] != C1[i];

free(A, Q);
free(C, Q);
free(C1, Q);
return error;
}

int main() {
bool result = false;
result |= test<float>();
result |= test<uint32_t>();
result |= test<uint16_t>();
result |= test<uint8_t>();
result |= test<sycl::half>();

std::cout << (result ? "FAILED" : "passed") << std::endl;
return 0;
}
77 changes: 45 additions & 32 deletions sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,67 +20,67 @@ using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

int main() {
constexpr uint32_t BlockWidth = 16;
constexpr uint32_t BlockHeight = 4;
template <typename T> bool test() {
constexpr uint32_t BlockWidth = 8;
constexpr uint32_t BlockHeight = 8;
constexpr uint32_t NumBlocks = 1;
constexpr uint32_t SurfaceHeight = 16;
constexpr uint32_t SurfaceWidth = 16;
constexpr uint32_t SurfacePitch = 16;
constexpr uint32_t SurfaceHeight = BlockHeight;
constexpr uint32_t SurfaceWidth = BlockWidth * NumBlocks;
constexpr uint32_t SurfacePitch = BlockWidth;
constexpr uint32_t x = 0;
constexpr uint32_t y = 0;

constexpr uint32_t Size = SurfacePitch * SurfaceHeight * NumBlocks;
constexpr uint32_t Size = SurfaceWidth * SurfaceHeight;

queue q;
auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
auto *A = malloc_shared<float>(Size, q);
auto *B = malloc_shared<float>(Size, q);
auto *C = malloc_shared<float>(Size, q);
auto *C1 = malloc_shared<float>(Size, q);
auto *A = malloc_shared<T>(Size, q);
auto *B = malloc_shared<T>(Size, q);
auto *C = malloc_shared<T>(Size, q);
auto *C1 = malloc_shared<T>(Size, q);

for (auto i = 0; i != Size; i++) {
A[i] = B[i] = 7;
A[i] = B[i] = i;
C[i] = C1[i] = 0;
}

auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<class Test>(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1;
cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1;
constexpr uint32_t height = SurfaceHeight - 1;
constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1;
auto data_a = lsc_load_2d<float, BlockWidth, BlockHeight, NumBlocks>(
constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1;
simd<T, Size> data_a = lsc_load_2d<T, BlockWidth, BlockHeight, NumBlocks>(
A, width, height, pitch, x, y);
auto data_b = lsc_load_2d<float, BlockWidth, BlockHeight, NumBlocks>(
simd<T, Size> data_b = lsc_load_2d<T, BlockWidth, BlockHeight, NumBlocks>(
B, width, height, pitch, x, y);

auto data_c = data_a + data_b;
simd<T, Size> data_c = data_a + data_b;

lsc_store_2d<float, BlockWidth, BlockHeight>(C, width, height, pitch, x,
y, data_c);
lsc_store_2d<T, BlockWidth, BlockHeight>(C, width, height, pitch, x, y,
data_c);
});
});
e.wait();

auto e1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Test1>(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1;
cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL {
constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1;
constexpr uint32_t height = SurfaceHeight - 1;
constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1;
constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1;

config_2d_mem_access<float, BlockWidth, BlockHeight, NumBlocks> payload(
config_2d_mem_access<T, BlockWidth, BlockHeight, NumBlocks> payload(
A, width, height, pitch, 0, 0);
lsc_prefetch_2d<float, BlockWidth, BlockHeight, NumBlocks, false, false,
lsc_prefetch_2d<T, BlockWidth, BlockHeight, NumBlocks, false, false,
cache_hint::cached, cache_hint::cached>(payload);
auto data_a = lsc_load_2d(payload);
simd<T, Size> data_a = lsc_load_2d(payload);

payload.set_data_pointer(B);
lsc_prefetch_2d<float, BlockWidth, BlockHeight, NumBlocks, false, false,
lsc_prefetch_2d<T, BlockWidth, BlockHeight, NumBlocks, false, false,
cache_hint::cached, cache_hint::cached>(payload);
auto data_b = lsc_load_2d(payload);
simd<T, Size> data_b = lsc_load_2d(payload);

auto data_c = data_a + data_b;
simd<T, Size> data_c = data_a + data_b;

payload.set_data_pointer(C1);

Expand All @@ -89,14 +89,27 @@ int main() {
});
e1.wait();

auto error = 0;
bool error = false;
for (auto i = 0; i < Size; ++i)
error += std::abs(C[i] - C1[i]);
error |= C[i] != C1[i];

free(A, q);
free(B, q);
free(C, q);
free(C1, q);
std::cout << (error != 0 ? "FAILED" : "passed") << std::endl;
return error;
}

int main() {
bool result = false;
result |= test<float>();
result |= test<uint32_t>();
result |= test<uint16_t>();
result |= test<uint64_t>();
result |= test<double>();
result |= test<uint8_t>();
result |= test<sycl::half>();

std::cout << (result ? "FAILED" : "passed") << std::endl;
return 0;
}