Skip to content

Commit

Permalink
[ESIMD] Conditionally revert the recent changes for slm_gather() (#12341
Browse files Browse the repository at this point in the history
)

The previous fix (#12316) added usage
of llvm.masked.gather for slm_gather(). Such usage does not work with
current GPU drivers if ESIMD function is called via invoke_simd() API.

The fix here returns the previous lowering to GenX instead of
llvm.masked.gather.
Using the lowering to llvm.masked.gather can be used if define
__ESIMD_GATHER_SCATTER_LLVM_IR macro (turned off by default).

Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov@intel.com>
  • Loading branch information
v-klochkov authored Jan 10, 2024
1 parent c7549f9 commit 5e9564d
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 3 deletions.
13 changes: 10 additions & 3 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3146,9 +3146,16 @@ slm_gather(simd<uint32_t, N> byte_offsets, simd_mask<N> mask,
detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
static_assert(Alignment >= sizeof(T),
"slm_gather() requires at least element-size alignment");
simd<MsgT, N> PassThru; // it is intentionally undefined
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), PassThru.data());
if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
simd<MsgT, N> PassThru; // it is intentionally undefined
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), PassThru.data());
} else {
static_assert(N == 1 || N == 8 || N == 16 || N == 32,
"Unsupported vector length");
detail::LocalAccessorMarker acc;
return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
}
}

template <typename T, int N,
Expand Down
155 changes: 155 additions & 0 deletions sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// GPU driver had an error in handling of SLM aligned block_loads/stores,
// which has been fixed only in "1.3.26816", and in win/opencl version going
// _after_ 101.4575.
// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.4576
//
// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
//
// VISALTO enable run
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out

/*
* Test check basic support of local memory access in invoke_simd.
*/

#include "../invoke_simd_utils.hpp"

#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
#include <sycl/sycl.hpp>

#include <functional>
#include <iostream>
#include <type_traits>

// TODO: When gpu driver can pass/accept accessor by value,
// the work-around undef #ifdef US_ACC_PTR should be removed.
#define USE_ACC_PTR

/* Subgroup size attribute is optional
* In case it is absent compiler decides what subgroup size to use
*/
#ifdef IMPL_SUBGROUP
#define SUBGROUP_ATTR
#else
#define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]]
#endif

using namespace sycl;
using namespace sycl::ext::oneapi::experimental;
namespace esimd = sycl::ext::intel::esimd;

using dtype = int;

constexpr int VL = 16;
constexpr uint32_t LocalRange = VL * 2; // 2 sub-groups per 1 group.
constexpr uint32_t GlobalRange = LocalRange * 2; // 2 groups.

ESIMD_INLINE void slm_load_store_test(
local_accessor<dtype, 1> LocalAcc, uint32_t LAByteOffset, dtype *A,
dtype *C, esimd::simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {

uint32_t LocalAccOffset =
static_cast<uint32_t>(
reinterpret_cast<std::uintptr_t>(LocalAcc.get_pointer().get())) +
LAByteOffset;
esimd::simd<uint32_t, VL> Offsets(LocalAccOffset, sizeof(dtype));
auto Local1 = esimd::slm_gather<dtype, VL>(Offsets);
Offsets += static_cast<uint32_t>(LocalRange * sizeof(dtype));
auto Local2 = esimd::slm_gather<dtype, VL>(Offsets);

auto Global = esimd::gather(A, GlobalByteOffsets);
auto Res = Global + Local1 + Local2;
esimd::slm_scatter(Offsets, Res);
}

[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store_test(
#ifdef USE_ACC_PTR
local_accessor<dtype, 1> *LocalAcc,
#else
local_accessor<dtype, 1> LocalAcc,
#endif
uint32_t SLMByteOffset, dtype *A, dtype *C,
simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {
#ifdef USE_ACC_PTR
slm_load_store_test(*LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets);
#else
slm_load_store_test(LocalAcc, SLMByteOffset, A, C, GlobalByteOffsets);
#endif
}

int main(void) {
auto Q = queue{gpu_selector_v};
auto Dev = Q.get_device();
std::cout << "Running on " << Dev.get_info<sycl::info::device::name>()
<< std::endl;

auto DeviceSLMSize = Dev.get_info<sycl::info::device::local_mem_size>();
std::cout << "Local Memory Size: " << DeviceSLMSize << std::endl;

sycl::nd_range<1> NDRange{range<1>{GlobalRange}, range<1>{LocalRange}};

// The test is going to use (LocalRange * 2) elements of dtype type.
if (DeviceSLMSize < LocalRange * 2 * sizeof(dtype)) {
// Report an error - the test needs a fix.
std::cerr << "Error: Test needs more SLM memory than device has"
<< std::endl;
return 1;
}

auto *A = malloc_shared<dtype>(GlobalRange, Q);
auto *C = malloc_shared<dtype>(GlobalRange, Q);

for (auto i = 0; i < GlobalRange; i++) {
A[i] = i;
C[i] = 0;
}
try {
Q.submit([&](handler &CGH) {
auto LocalAcc = local_accessor<dtype, 1>(LocalRange * 2, CGH);
CGH.parallel_for(NDRange, [=](nd_item<1> Item) SUBGROUP_ATTR {
uint32_t GlobalId = Item.get_global_id(0);
uint32_t LocalId = Item.get_local_id(0);
auto LocalAccCopy = LocalAcc;
LocalAccCopy[LocalId] = GlobalId * 100;
LocalAccCopy[LocalId + LocalRange] = GlobalId * 10000;
Item.barrier();

uint32_t LAByteOffset = (LocalId / VL) * VL * sizeof(dtype);
uint32_t GlobalByteOffset = GlobalId * sizeof(dtype);
sycl::sub_group SG = Item.get_sub_group();
#ifdef USE_ACC_PTR
auto LocalAccArg = uniform{&LocalAccCopy};
#else
auto LocalAccArg = uniform{LocalAccCopy};
#endif
invoke_simd(SG, invoke_slm_load_store_test, LocalAccArg,
uniform{LAByteOffset}, uniform{A}, uniform{C},
GlobalByteOffset);
C[GlobalId] = LocalAccCopy[LocalId + LocalRange];
});
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(A, Q);
free(C, Q);
return e.code().value();
}

bool Pass = true;
for (auto i = 0; i < GlobalRange; i++) {
dtype Expected = A[i] + i * (10000 + 100);
if (C[i] != Expected) {
std::cout << "Error: C[" << i << "]:" << C[i]
<< " != [expected]:" << Expected << std::endl;
Pass = false;
}
}

free(A, Q);
free(C, Q);

std::cout << "Test result: " << (Pass ? "Pass" : "Fail") << std::endl;
return Pass ? 0 : 1;
}

0 comments on commit 5e9564d

Please sign in to comment.