From e706bfc91e390c0d40226dadcc31a5dfb8a4f299 Mon Sep 17 00:00:00 2001 From: Gregory Fine Date: Sun, 24 Dec 2023 16:14:11 -0800 Subject: [PATCH 1/5] Fix lsc_load_2d API issue that prevented usage for different types --- .../ext/intel/experimental/esimd/memory.hpp | 53 ++++++++---- .../ESIMD/lsc/lsc_load_2d_compare.cpp | 81 +++++++++++++++++++ .../ESIMD/lsc/lsc_load_store_2d_compare.cpp | 77 ++++++++++-------- 3 files changed, 165 insertions(+), 46 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index a7fe58326a87b..bcceb72842954 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd Vals) { } } +template constexpr uint32_t get_lsc_data_size() { + switch (sizeof(T)) { + case 1: + return 0; + case 2: + return 1; + case 4: + return 2; + case 8: + return 3; + default: + static_assert(true, "Unsupported data type."); + } +} + template constexpr uint32_t get_lsc_load_cache_mask() { if constexpr (L1H == cache_hint::read_invalidate && @@ -1984,14 +1999,15 @@ constexpr void check_lsc_block_2d_restrictions() { /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks /// -template ()> -__ESIMD_API __ESIMD_NS::simd -lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, + Tx, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()> +__ESIMD_API __ESIMD_NS::simd +lsc_load_2d(const Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { + using T = __ESIMD_DNS::__raw_t; detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions()> -__ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, + Tx, 1u, BlockHeight, BlockWidth, false, false>()> +__ESIMD_API void lsc_store_2d(Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, __ESIMD_NS::simd Vals) { + using T = __ESIMD_DNS::__raw_t; detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -2428,17 +2445,23 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( constexpr int DstBlockElements = GRFColSize * GRFRowSize; constexpr int DstElements = DstBlockElements * NBlocks; + constexpr uint32_t DstLength = DstBlockElements * sizeof(T) / 32; + constexpr uint32_t DstLengthMask = DstLength == 0 ? 1 << 20 + : DstLength == 32 ? 31 << 20 + : DstLength << 20; + static_assert(N == ActualN || N == DstElements, "Incorrect element count"); constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; - constexpr uint32_t base_desc = 0x2800403; + constexpr uint32_t base_desc = 0x2000003; constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0; constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0; + constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; __ESIMD_NS::simd oldDst; constexpr uint32_t exDesc = 0x0; - constexpr uint32_t desc = - base_desc | cache_mask | transformMask | transposeMask; + constexpr uint32_t desc = base_desc | cache_mask | transformMask | + transposeMask | dataSizeMask | DstLengthMask; constexpr uint8_t execSize = 1; constexpr uint8_t sfid = 0xF; constexpr uint8_t numSrc0 = 0x1; @@ -2500,12 +2523,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( "Transposed and transformed is not supported"); constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; - constexpr uint32_t base_desc = 0x2000403; + constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; + constexpr uint32_t base_desc = 0x2000003; constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0; constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0; constexpr uint32_t exDesc = 0x0; constexpr uint32_t desc = - base_desc | cache_mask | transformMask | transposeMask; + base_desc | cache_mask | transformMask | transposeMask | dataSizeMask; constexpr uint8_t execSize = 1; constexpr uint8_t sfid = 0xF; constexpr uint8_t numDst = (N * sizeof(T)) / 64; @@ -2542,10 +2566,11 @@ lsc_store_2d(config_2d_mem_access &payload, constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() << 17; - constexpr uint32_t base_desc = 0x2000407; + constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; + constexpr uint32_t base_desc = 0x2000007; constexpr uint32_t exDesc = 0x0; - constexpr uint32_t desc = base_desc | cache_mask; + constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask; constexpr uint8_t execSize = 1; constexpr uint8_t sfid = 0xF; constexpr uint8_t numSrc0 = 0x1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp new file mode 100644 index 0000000000000..ce8c443a4b126 --- /dev/null +++ b/sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp @@ -0,0 +1,81 @@ +//==----- lsc_load_2d_compare.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// The tests makes sure old and new load_2d API produce identical +// results. +#include +#include +#include + +using bf16 = sycl::ext::oneapi::bfloat16; +using namespace sycl; +using namespace sycl::ext::intel::esimd; +using namespace sycl::ext::intel::experimental::esimd; +template bool test() { + sycl::queue Q(sycl::gpu_selector_v); + auto dev = Q.get_device(); + std::cout << "Running on " << dev.get_info() + << "\n"; + + constexpr int TM = 8; + constexpr int TN = 8; + constexpr int NBLOCKS = 2; + constexpr int WIDTH = 2 * TN; + constexpr int HEIGHT = TM; + constexpr int PITCH = WIDTH; + constexpr int SIZE = WIDTH * HEIGHT; + + auto *A = malloc_shared(SIZE, Q); + auto *B = malloc_shared(SIZE, Q); + auto *C = malloc_shared(SIZE, Q); + auto *C1 = malloc_shared(SIZE, Q); + + for (int i = 0; i < SIZE; i++) { + A[i] = static_cast(i); + } + + Q.parallel_for(sycl::nd_range<1>(1, 1), [=](sycl::nd_item<1> + item) SYCL_ESIMD_KERNEL { + config_2d_mem_access my_config( + A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0); + + simd tmp = + lsc_load_2d(my_config); + simd tmp1 = lsc_load_2d( + my_config.get_data_pointer(), my_config.get_surface_width(), + my_config.get_surface_height(), my_config.get_surface_pitch(), + my_config.get_x(), my_config.get_y()); + + tmp.copy_to(C); + tmp1.copy_to(C1); + }).wait(); + + bool error = false; + for (auto i = 0; i < SIZE; ++i) + error |= C[i] != C1[i]; + + free(A, Q); + free(C, Q); + free(C1, Q); + return error; +} + +int main() { + bool result = false; + result |= test(); + result |= test(); + result |= test(); + result |= test(); + result |= test(); + + std::cout << (result ? "FAILED" : "passed") << std::endl; + return 0; +} diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp index 9f18b9081df46..2ae0dd567e3f7 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp @@ -20,67 +20,67 @@ using namespace sycl; using namespace sycl::ext::intel::esimd; using namespace sycl::ext::intel::experimental::esimd; -int main() { - constexpr uint32_t BlockWidth = 16; - constexpr uint32_t BlockHeight = 4; +template bool test() { + constexpr uint32_t BlockWidth = 8; + constexpr uint32_t BlockHeight = 8; constexpr uint32_t NumBlocks = 1; - constexpr uint32_t SurfaceHeight = 16; - constexpr uint32_t SurfaceWidth = 16; - constexpr uint32_t SurfacePitch = 16; + constexpr uint32_t SurfaceHeight = BlockHeight; + constexpr uint32_t SurfaceWidth = BlockWidth * NumBlocks; + constexpr uint32_t SurfacePitch = BlockWidth; constexpr uint32_t x = 0; constexpr uint32_t y = 0; - constexpr uint32_t Size = SurfacePitch * SurfaceHeight * NumBlocks; + constexpr uint32_t Size = SurfaceWidth * SurfaceHeight; queue q; auto dev = q.get_device(); std::cout << "Running on " << dev.get_info() << "\n"; - auto *A = malloc_shared(Size, q); - auto *B = malloc_shared(Size, q); - auto *C = malloc_shared(Size, q); - auto *C1 = malloc_shared(Size, q); + auto *A = malloc_shared(Size, q); + auto *B = malloc_shared(Size, q); + auto *C = malloc_shared(Size, q); + auto *C1 = malloc_shared(Size, q); for (auto i = 0; i != Size; i++) { - A[i] = B[i] = 7; + A[i] = B[i] = i; C[i] = C1[i] = 0; } auto e = q.submit([&](handler &cgh) { - cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL { - constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1; + cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL { + constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1; constexpr uint32_t height = SurfaceHeight - 1; - constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1; - auto data_a = lsc_load_2d( + constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1; + simd data_a = lsc_load_2d( A, width, height, pitch, x, y); - auto data_b = lsc_load_2d( + simd data_b = lsc_load_2d( B, width, height, pitch, x, y); - auto data_c = data_a + data_b; + simd data_c = data_a + data_b; - lsc_store_2d(C, width, height, pitch, x, - y, data_c); + lsc_store_2d(C, width, height, pitch, x, y, + data_c); }); }); e.wait(); auto e1 = q.submit([&](handler &cgh) { - cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL { - constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1; + cgh.parallel_for(range<1>{1}, [=](id<1> i) SYCL_ESIMD_KERNEL { + constexpr uint32_t width = SurfaceWidth * sizeof(T) - 1; constexpr uint32_t height = SurfaceHeight - 1; - constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1; + constexpr uint32_t pitch = SurfacePitch * sizeof(T) - 1; - config_2d_mem_access payload( + config_2d_mem_access payload( A, width, height, pitch, 0, 0); - lsc_prefetch_2d(payload); - auto data_a = lsc_load_2d(payload); + simd data_a = lsc_load_2d(payload); payload.set_data_pointer(B); - lsc_prefetch_2d(payload); - auto data_b = lsc_load_2d(payload); + simd data_b = lsc_load_2d(payload); - auto data_c = data_a + data_b; + simd data_c = data_a + data_b; payload.set_data_pointer(C1); @@ -89,14 +89,27 @@ int main() { }); e1.wait(); - auto error = 0; + bool error = false; for (auto i = 0; i < Size; ++i) - error += std::abs(C[i] - C1[i]); + error |= C[i] != C1[i]; free(A, q); free(B, q); free(C, q); free(C1, q); - std::cout << (error != 0 ? "FAILED" : "passed") << std::endl; + return error; +} + +int main() { + bool result = false; + result |= test(); + result |= test(); + result |= test(); + result |= test(); + result |= test(); + result |= test(); + result |= test(); + + std::cout << (result ? "FAILED" : "passed") << std::endl; return 0; } From 2170b47b439b8537e7e54afeed51ed6d2ec2e9b6 Mon Sep 17 00:00:00 2001 From: Gregory Fine Date: Sun, 24 Dec 2023 16:29:28 -0800 Subject: [PATCH 2/5] Fix a build issue --- sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index bcceb72842954..e57a65d4c5e12 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2161,7 +2161,7 @@ template ()> __ESIMD_API void lsc_store_2d(Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, - int X, int Y, __ESIMD_NS::simd Vals) { + int X, int Y, __ESIMD_NS::simd Vals) { using T = __ESIMD_DNS::__raw_t; detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions Date: Mon, 25 Dec 2023 20:58:41 -0800 Subject: [PATCH 3/5] Address PR comments --- .../ext/intel/experimental/esimd/memory.hpp | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index e57a65d4c5e12..c79c0e30c7e30 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1999,25 +1999,25 @@ constexpr void check_lsc_block_2d_restrictions() { /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks /// -template ()> -__ESIMD_API __ESIMD_NS::simd -lsc_load_2d(const Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, + T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()> +__ESIMD_API __ESIMD_NS::simd +lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { - using T = __ESIMD_DNS::__raw_t; + using RawT = __ESIMD_DNS::__raw_t; detail::check_lsc_cache_hint(); - detail::check_lsc_block_2d_restrictions(); // For Load BlockWidth is padded up to the next power-of-two value. // For Load with Transpose the pre-operation BlockHeight is padded up // to the next power-of-two value. // For Load with Transform pre-operation BlockHeight is padded up to // multiple of K, where K = 4B / sizeof(T). - constexpr int ElemsPerDword = 4 / sizeof(T); + constexpr int ElemsPerDword = 4 / sizeof(RawT); constexpr int GRFRowSize = Transposed ? BlockHeight : Transformed ? BlockWidth * ElemsPerDword : BlockWidth; @@ -2029,7 +2029,7 @@ lsc_load_2d(const Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, : BlockHeight); constexpr int GRFBlockSize = GRFRowPitch * GRFColSize; constexpr int GRFBlockPitch = - detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>(); + detail::roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>(); constexpr int ActualN = NBlocks * GRFBlockPitch; constexpr int DstBlockElements = GRFColSize * GRFRowSize; @@ -2038,14 +2038,14 @@ lsc_load_2d(const Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, static_assert(N == ActualN || N == DstElements, "Incorrect element count"); constexpr lsc_data_size DS = - detail::finalize_data_size(); + detail::finalize_data_size(); __ESIMD_NS::simd_mask pred = 1; uintptr_t surf_addr = reinterpret_cast(Ptr); constexpr detail::lsc_data_order _Transposed = Transposed ? detail::lsc_data_order::transpose : detail::lsc_data_order::nontranspose; - __ESIMD_NS::simd Raw = - __esimd_lsc_load2d_stateless Raw = + __esimd_lsc_load2d_stateless(pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, @@ -2071,16 +2071,17 @@ lsc_load_2d(const Tx *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, // +----+----+----+----+----+----+-----+-----+ // * signifies the padded element. - __ESIMD_NS::simd Dst; + __ESIMD_NS::simd Dst; for (auto i = 0; i < NBlocks; i++) { auto DstBlock = Dst.template select(i * DstBlockElements); auto RawBlock = Raw.template select(i * GRFBlockPitch); - DstBlock = RawBlock.template bit_cast_view() - .template select(0, 0) - .template bit_cast_view(); + DstBlock = + RawBlock.template bit_cast_view() + .template select(0, 0) + .template bit_cast_view(); } return Dst; @@ -2155,38 +2156,39 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks /// -template ()> -__ESIMD_API void lsc_store_2d(Tx *Ptr, unsigned SurfaceWidth, + T, 1u, BlockHeight, BlockWidth, false, false>()> +__ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, - int X, int Y, __ESIMD_NS::simd Vals) { - using T = __ESIMD_DNS::__raw_t; + int X, int Y, __ESIMD_NS::simd Vals) { + using RawT = __ESIMD_DNS::__raw_t; detail::check_lsc_cache_hint(); - detail::check_lsc_block_2d_restrictions(); + detail::check_lsc_block_2d_restrictions(); constexpr lsc_data_size DS = - detail::finalize_data_size(); + detail::finalize_data_size(); uintptr_t surf_addr = reinterpret_cast(Ptr); constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2(); - __ESIMD_NS::simd Raw; + __ESIMD_NS::simd Raw; if constexpr (BlockHeight * Pitch == N) { Raw = Vals; } else { // For store with padding, allocate the block with padding, and place // original data there. - auto Data2D = Vals.template bit_cast_view(); - auto Raw2D = Raw.template bit_cast_view(); + auto Data2D = Vals.template bit_cast_view(); + auto Raw2D = Raw.template bit_cast_view(); Raw2D.template select(0, 0) = Data2D; } __ESIMD_NS::simd_mask pred = 1; - __esimd_lsc_store2d_stateless( pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Raw.data()); From 99537becb171f4c87eae162b7c69a143acbe97a4 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 26 Dec 2023 15:11:44 -0800 Subject: [PATCH 4/5] Address PR comments --- .../include/sycl/ext/intel/experimental/esimd/memory.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index c79c0e30c7e30..8a8822fe3e50f 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2447,10 +2447,11 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( constexpr int DstBlockElements = GRFColSize * GRFRowSize; constexpr int DstElements = DstBlockElements * NBlocks; - constexpr uint32_t DstLength = DstBlockElements * sizeof(T) / 32; - constexpr uint32_t DstLengthMask = DstLength == 0 ? 1 << 20 - : DstLength == 32 ? 31 << 20 - : DstLength << 20; + constexpr uint32_t GrfBytes = 64; + constexpr uint32_t DstBlockSize = + detail::roundUpNextMultiple(); + constexpr uint32_t DstLength = DstBlockSize > 31 ? 31 : DstBlockSize; + constexpr uint32_t DstLengthMask = DstLength << 20; static_assert(N == ActualN || N == DstElements, "Incorrect element count"); From d23d21c04a1f5b6745399807c846695559037d3e Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Sat, 30 Dec 2023 18:23:23 -0800 Subject: [PATCH 5/5] Address PR comments --- sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 8a8822fe3e50f..581626c9b3d64 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2449,8 +2449,9 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( constexpr uint32_t GrfBytes = 64; constexpr uint32_t DstBlockSize = - detail::roundUpNextMultiple(); - constexpr uint32_t DstLength = DstBlockSize > 31 ? 31 : DstBlockSize; + detail::roundUpNextMultiple(); + constexpr uint32_t DstLength = + (DstBlockSize / GrfBytes) > 31 ? 31 : (DstBlockSize / GrfBytes); constexpr uint32_t DstLengthMask = DstLength << 20; static_assert(N == ActualN || N == DstElements, "Incorrect element count");