Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][ESIMD] Fix lsc_load_2d API issue that prevented usage for different types #12244

Merged
merged 5 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 54 additions & 25 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
}
}

template <typename T> constexpr uint32_t get_lsc_data_size() {
switch (sizeof(T)) {
case 1:
return 0;
case 2:
return 1;
case 4:
return 2;
case 8:
return 3;
default:
static_assert(true, "Unsupported data type.");
}
}

template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
constexpr uint32_t get_lsc_load_cache_mask() {
if constexpr (L1H == cache_hint::read_invalidate &&
Expand Down Expand Up @@ -1992,16 +2007,17 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
unsigned SurfacePitch, int X, int Y) {
using RawT = __ESIMD_DNS::__raw_t<T>;
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
Transposed, Transformed,
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight,
NBlocks, Transposed, Transformed,
detail::block_2d_op::load>();
// For Load BlockWidth is padded up to the next power-of-two value.
// For Load with Transpose the pre-operation BlockHeight is padded up
// to the next power-of-two value.
// For Load with Transform pre-operation BlockHeight is padded up to
// multiple of K, where K = 4B / sizeof(T).
constexpr int ElemsPerDword = 4 / sizeof(T);
constexpr int ElemsPerDword = 4 / sizeof(RawT);
constexpr int GRFRowSize = Transposed ? BlockHeight
: Transformed ? BlockWidth * ElemsPerDword
: BlockWidth;
Expand All @@ -2013,7 +2029,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
: BlockHeight);
constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
constexpr int GRFBlockPitch =
detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
detail::roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
constexpr int ActualN = NBlocks * GRFBlockPitch;

constexpr int DstBlockElements = GRFColSize * GRFRowSize;
Expand All @@ -2022,14 +2038,14 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
static_assert(N == ActualN || N == DstElements, "Incorrect element count");

constexpr lsc_data_size DS =
detail::finalize_data_size<T, lsc_data_size::default_size>();
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
__ESIMD_NS::simd_mask<ActualN> pred = 1;
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
constexpr detail::lsc_data_order _Transposed =
Transposed ? detail::lsc_data_order::transpose
: detail::lsc_data_order::nontranspose;
__ESIMD_NS::simd<T, ActualN> Raw =
__esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
__ESIMD_NS::simd<RawT, ActualN> Raw =
__esimd_lsc_load2d_stateless<RawT, L1H, L3H, DS, _Transposed, NBlocks,
BlockWidth, BlockHeight, Transformed,
ActualN>(pred.data(), surf_addr,
SurfaceWidth, SurfaceHeight,
Expand All @@ -2055,16 +2071,17 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
// +----+----+----+----+----+----+-----+-----+
// * signifies the padded element.

__ESIMD_NS::simd<T, DstElements> Dst;
__ESIMD_NS::simd<RawT, DstElements> Dst;

for (auto i = 0; i < NBlocks; i++) {
auto DstBlock =
Dst.template select<DstBlockElements, 1>(i * DstBlockElements);

auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
.template bit_cast_view<T>();
DstBlock =
RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
.template bit_cast_view<RawT>();
}

return Dst;
Expand Down Expand Up @@ -2146,30 +2163,32 @@ template <typename T, int BlockWidth, int BlockHeight = 1,
__ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth,
unsigned SurfaceHeight, unsigned SurfacePitch,
int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
using RawT = __ESIMD_DNS::__raw_t<T>;
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
false, detail::block_2d_op::store>();
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1,
false, false,
detail::block_2d_op::store>();
constexpr lsc_data_size DS =
detail::finalize_data_size<T, lsc_data_size::default_size>();
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::nontranspose;

constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
__ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
__ESIMD_NS::simd<RawT, BlockHeight * Pitch> Raw;

if constexpr (BlockHeight * Pitch == N) {
Raw = Vals;
} else {
// For store with padding, allocate the block with padding, and place
// original data there.
auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
}

__ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
__esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
__esimd_lsc_store2d_stateless<RawT, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
BlockHeight, false, BlockHeight * Pitch>(
pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
Raw.data());
Expand Down Expand Up @@ -2428,17 +2447,25 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
constexpr int DstElements = DstBlockElements * NBlocks;

constexpr uint32_t GrfBytes = 64;
constexpr uint32_t DstBlockSize =
detail::roundUpNextMultiple<DstElements * sizeof(T), GrfBytes>();
constexpr uint32_t DstLength =
(DstBlockSize / GrfBytes) > 31 ? 31 : (DstBlockSize / GrfBytes);
constexpr uint32_t DstLengthMask = DstLength << 20;

static_assert(N == ActualN || N == DstElements, "Incorrect element count");

constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2800403;
constexpr uint32_t base_desc = 0x2000003;
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
__ESIMD_NS::simd<T, N> oldDst;
constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc =
base_desc | cache_mask | transformMask | transposeMask;
constexpr uint32_t desc = base_desc | cache_mask | transformMask |
transposeMask | dataSizeMask | DstLengthMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numSrc0 = 0x1;
Expand Down Expand Up @@ -2500,12 +2527,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
"Transposed and transformed is not supported");
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2000403;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
constexpr uint32_t base_desc = 0x2000003;
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc =
base_desc | cache_mask | transformMask | transposeMask;
base_desc | cache_mask | transformMask | transposeMask | dataSizeMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numDst = (N * sizeof(T)) / 64;
Expand Down Expand Up @@ -2542,10 +2570,11 @@ lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,

constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
<< 17;
constexpr uint32_t base_desc = 0x2000407;
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
constexpr uint32_t base_desc = 0x2000007;

constexpr uint32_t exDesc = 0x0;
constexpr uint32_t desc = base_desc | cache_mask;
constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask;
constexpr uint8_t execSize = 1;
constexpr uint8_t sfid = 0xF;
constexpr uint8_t numSrc0 = 0x1;
Expand Down
81 changes: 81 additions & 0 deletions sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
//==----- lsc_load_2d_compare.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu-intel-pvc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are in process of adding DG2 tests. I don't remember if load_2d supported by DG2. If Yes, then lease add one it for DG2. Perhaps this test will simply work for both DG2 and PVC

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried to run it on dg2 and it gets stuck. There is probably another version of load_2d for dg2 or other platforms but this one appears to be for PVC only

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The untyped load/store/prefetch 2d block operations are not supported by DG2/MTL/ARL. They are only available for PVC and Xe2+.

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// The tests makes sure old and new load_2d API produce identical
// results.
#include <iostream>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using bf16 = sycl::ext::oneapi::bfloat16;
using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;
template <typename T> bool test() {
sycl::queue Q(sycl::gpu_selector_v);
auto dev = Q.get_device();
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
<< "\n";

constexpr int TM = 8;
constexpr int TN = 8;
constexpr int NBLOCKS = 2;
constexpr int WIDTH = 2 * TN;
constexpr int HEIGHT = TM;
constexpr int PITCH = WIDTH;
constexpr int SIZE = WIDTH * HEIGHT;

auto *A = malloc_shared<T>(SIZE, Q);
auto *B = malloc_shared<T>(SIZE, Q);
auto *C = malloc_shared<T>(SIZE, Q);
auto *C1 = malloc_shared<T>(SIZE, Q);

for (int i = 0; i < SIZE; i++) {
A[i] = static_cast<T>(i);
}

Q.parallel_for(sycl::nd_range<1>(1, 1), [=](sycl::nd_item<1>
item) SYCL_ESIMD_KERNEL {
config_2d_mem_access<T, TN, TM, NBLOCKS> my_config(
A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0);

simd<T, NBLOCKS * TM * TN> tmp =
lsc_load_2d<T, TN, TM, NBLOCKS, false, false>(my_config);
simd<T, NBLOCKS * TM * TN> tmp1 = lsc_load_2d<T, TN, TM, NBLOCKS>(
my_config.get_data_pointer(), my_config.get_surface_width(),
my_config.get_surface_height(), my_config.get_surface_pitch(),
my_config.get_x(), my_config.get_y());

tmp.copy_to(C);
tmp1.copy_to(C1);
}).wait();

bool error = false;
for (auto i = 0; i < SIZE; ++i)
error |= C[i] != C1[i];

free(A, Q);
free(C, Q);
free(C1, Q);
return error;
}

int main() {
bool result = false;
result |= test<float>();
result |= test<uint32_t>();
result |= test<uint16_t>();
result |= test<uint8_t>();
result |= test<sycl::half>();

std::cout << (result ? "FAILED" : "passed") << std::endl;
return 0;
}
Loading