From 5e9564d597736b0b41294b8c75a0482da57e957f Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Wed, 10 Jan 2024 10:09:57 -0600 Subject: [PATCH] [ESIMD] Conditionally revert the recent changes for slm_gather() (#12341) The previous fix (https://github.com/intel/llvm/pull/12316) added usage of llvm.masked.gather for slm_gather(). Such usage does not work with current GPU drivers if ESIMD function is called via invoke_simd() API. The fix here returns the previous lowering to GenX instead of llvm.masked.gather. Using the lowering to llvm.masked.gather can be used if define __ESIMD_GATHER_SCATTER_LLVM_IR macro (turned off by default). Signed-off-by: Klochkov, Vyacheslav N --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 13 +- .../Regression/slm_gather_scatter.cpp | 155 ++++++++++++++++++ 2 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index b9a105c4298a7..6203948f50c45 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3146,9 +3146,16 @@ slm_gather(simd byte_offsets, simd_mask mask, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "slm_gather() requires at least element-size alignment"); - simd PassThru; // it is intentionally undefined - return __esimd_slm_gather_ld( - byte_offsets.data(), mask.data(), PassThru.data()); + if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + simd PassThru; // it is intentionally undefined + return __esimd_slm_gather_ld( + byte_offsets.data(), mask.data(), PassThru.data()); + } else { + static_assert(N == 1 || N == 8 || N == 16 || N == 32, + "Unsupported vector length"); + detail::LocalAccessorMarker acc; + return detail::gather_impl(acc, byte_offsets, 0, mask); + } } template +#include +#include + +#include +#include +#include + +// TODO: When gpu driver can pass/accept accessor by value, +// the work-around undef #ifdef US_ACC_PTR should be removed. +#define USE_ACC_PTR + +/* Subgroup size attribute is optional + * In case it is absent compiler decides what subgroup size to use + */ +#ifdef IMPL_SUBGROUP +#define SUBGROUP_ATTR +#else +#define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]] +#endif + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; +namespace esimd = sycl::ext::intel::esimd; + +using dtype = int; + +constexpr int VL = 16; +constexpr uint32_t LocalRange = VL * 2; // 2 sub-groups per 1 group. +constexpr uint32_t GlobalRange = LocalRange * 2; // 2 groups. + +ESIMD_INLINE void slm_load_store_test( + local_accessor LocalAcc, uint32_t LAByteOffset, dtype *A, + dtype *C, esimd::simd GlobalByteOffsets) SYCL_ESIMD_FUNCTION { + + uint32_t LocalAccOffset = + static_cast( + reinterpret_cast(LocalAcc.get_pointer().get())) + + LAByteOffset; + esimd::simd Offsets(LocalAccOffset, sizeof(dtype)); + auto Local1 = esimd::slm_gather(Offsets); + Offsets += static_cast(LocalRange * sizeof(dtype)); + auto Local2 = esimd::slm_gather(Offsets); + + auto Global = esimd::gather(A, GlobalByteOffsets); + auto Res = Global + Local1 + Local2; + esimd::slm_scatter(Offsets, Res); +} + +[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store_test( +#ifdef USE_ACC_PTR + local_accessor *LocalAcc, +#else + local_accessor LocalAcc, +#endif + uint32_t SLMByteOffset, dtype *A, dtype *C, + simd GlobalByteOffsets) SYCL_ESIMD_FUNCTION { +#ifdef USE_ACC_PTR + slm_load_store_test(*LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets); +#else + slm_load_store_test(LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets); +#endif +} + +int main(void) { + auto Q = queue{gpu_selector_v}; + auto Dev = Q.get_device(); + std::cout << "Running on " << Dev.get_info() + << std::endl; + + auto DeviceSLMSize = Dev.get_info(); + std::cout << "Local Memory Size: " << DeviceSLMSize << std::endl; + + sycl::nd_range<1> NDRange{range<1>{GlobalRange}, range<1>{LocalRange}}; + + // The test is going to use (LocalRange * 2) elements of dtype type. + if (DeviceSLMSize < LocalRange * 2 * sizeof(dtype)) { + // Report an error - the test needs a fix. + std::cerr << "Error: Test needs more SLM memory than device has" + << std::endl; + return 1; + } + + auto *A = malloc_shared(GlobalRange, Q); + auto *C = malloc_shared(GlobalRange, Q); + + for (auto i = 0; i < GlobalRange; i++) { + A[i] = i; + C[i] = 0; + } + try { + Q.submit([&](handler &CGH) { + auto LocalAcc = local_accessor(LocalRange * 2, CGH); + CGH.parallel_for(NDRange, [=](nd_item<1> Item) SUBGROUP_ATTR { + uint32_t GlobalId = Item.get_global_id(0); + uint32_t LocalId = Item.get_local_id(0); + auto LocalAccCopy = LocalAcc; + LocalAccCopy[LocalId] = GlobalId * 100; + LocalAccCopy[LocalId + LocalRange] = GlobalId * 10000; + Item.barrier(); + + uint32_t LAByteOffset = (LocalId / VL) * VL * sizeof(dtype); + uint32_t GlobalByteOffset = GlobalId * sizeof(dtype); + sycl::sub_group SG = Item.get_sub_group(); +#ifdef USE_ACC_PTR + auto LocalAccArg = uniform{&LocalAccCopy}; +#else + auto LocalAccArg = uniform{LocalAccCopy}; +#endif + invoke_simd(SG, invoke_slm_load_store_test, LocalAccArg, + uniform{LAByteOffset}, uniform{A}, uniform{C}, + GlobalByteOffset); + C[GlobalId] = LocalAccCopy[LocalId + LocalRange]; + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(A, Q); + free(C, Q); + return e.code().value(); + } + + bool Pass = true; + for (auto i = 0; i < GlobalRange; i++) { + dtype Expected = A[i] + i * (10000 + 100); + if (C[i] != Expected) { + std::cout << "Error: C[" << i << "]:" << C[i] + << " != [expected]:" << Expected << std::endl; + Pass = false; + } + } + + free(A, Q); + free(C, Q); + + std::cout << "Test result: " << (Pass ? "Pass" : "Fail") << std::endl; + return Pass ? 0 : 1; +}