[ESIMD] Implement property-based gather(usm, ...) (#12316)
This patch also supports gather and masked gather of any length N if the
operation does not use L1/L2 cache hints or VS > 1.

Additionally, for gathers without L1/L2 cache hints or VS > 1, this patch
replaces calls to the GenX SVM gather intrinsics with plain LLVM IR
(llvm.masked.gather) if the macro __ESIMD_GATHER_SCATTER_LLVM_IR is defined
by the user. If it is not defined, then using masked gathers
with a pass_thru operand requires DG2/PVC.

---------

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov@intel.com>
v-klochkov authored Jan 9, 2024
1 parent e7910c3 commit 3eca2d4
Showing 11 changed files with 1,094 additions and 136 deletions.
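
Before the per-file diffs, here is a minimal usage sketch of the user-facing API this commit targets: a masked USM gather with a pass_thru value and a compile-time property list. This is a sketch, not code from this commit; the function name masked_load, the SIMD width, and the property contents are illustrative assumptions, and the real overloads live in sycl/include/sycl/ext/intel/esimd/memory.hpp.

#include <sycl/ext/intel/esimd.hpp>

namespace esimd = sycl::ext::intel::esimd;

// Gather 32 floats from Ptr + ByteOffsets; disabled lanes return PassThru.
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION esimd::simd<float, 32>
masked_load(const float *Ptr, esimd::simd<uint32_t, 32> ByteOffsets,
            esimd::simd_mask<32> Pred, esimd::simd<float, 32> PassThru) {
  // No L1/L2 cache hints and VS == 1, so any N is accepted by this patch.
  // The property-list spelling below is an assumption for illustration.
  esimd::properties Props{esimd::alignment<4>};
  return esimd::gather(Ptr, ByteOffsets, Pred, PassThru, Props);
}

When the translation unit is compiled with -D__ESIMD_GATHER_SCATTER_LLVM_IR, a call of this shape is expected to lower to an LLVM masked gather instead of a GenX SVM gather; without the macro, the pass_thru form requires DG2/PVC, as noted above.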
40 changes: 40 additions & 0 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/Demangle/ItaniumDemangle.h"
#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
@@ -970,6 +971,38 @@ static void translateBlockStore(CallInst &CI, bool IsSLM) {
  SI->setDebugLoc(CI.getDebugLoc());
}

static void translateGatherLoad(CallInst &CI, bool IsSLM) {
  IRBuilder<> Builder(&CI);
  constexpr int AlignmentTemplateArgIdx = 2;
  APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
                               ESIMDIntrinDesc::GenXArgConversion::TO_I64);
  Align AlignValue(Val.getZExtValue());

  auto OffsetsOp = CI.getArgOperand(0);
  auto MaskOp = CI.getArgOperand(1);
  auto PassThroughOp = CI.getArgOperand(2);
  auto DataType = CI.getType();

  // Convert the mask from <N x i16> to <N x i1>.
  Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
  MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);

  // The address space may be 3-SLM, 1-global or private.
  // At the moment of calling 'gather()' operation the pointer passed to it
  // is already 4-generic. Thus, simply use 4-generic for global and private
  // and let GPU BE deduce the actual address space from the use-def graph.
  unsigned AS = IsSLM ? 3 : 4;
  auto ElemType = DataType->getScalarType();
  auto NumElems = (cast<VectorType>(DataType))->getElementCount();
  auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
  auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);

  auto LI = Builder.CreateMaskedGather(DataType, VPtrOp, AlignValue, MaskOp,
                                       PassThroughOp);
  LI->setDebugLoc(CI.getDebugLoc());
  CI.replaceAllUsesWith(LI);
}

// TODO Specify document behavior for slm_init and nbarrier_init when:
// 1) they are called not from kernels
// 2) there are multiple such calls reachable from a kernel
@@ -1910,6 +1943,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
      ToErase.push_back(CI);
      continue;
    }
    if (Name.startswith("__esimd_gather_ld") ||
        Name.startswith("__esimd_slm_gather_ld")) {
      translateGatherLoad(*CI, Name.startswith("__esimd_slm_gather_ld"));
      ToErase.push_back(CI);
      continue;
    }

    if (Name.startswith("__esimd_nbarrier_init")) {
      translateNbarrierInit(*CI);
      ToErase.push_back(CI);
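For readers less familiar with the IRBuilder calls used in translateGatherLoad above, the following stand-alone sketch (not part of the patch; the helper name and parameter list are invented for illustration) shows the same lowering pattern: compare the <N x i16> mask against zero to get <N x i1>, cast the per-lane i64 addresses to a vector of pointers, and emit llvm.masked.gather.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Build an llvm.masked.gather from raw i64 addresses, an i16 per-lane mask,
// and a pass-through vector; the result type is taken from PassThru.
static Value *emitMaskedGatherSketch(IRBuilder<> &Builder, Value *Addrs,
                                     Value *Mask16, Value *PassThru,
                                     Align Alignment, unsigned AddrSpace) {
  auto *DataTy = cast<VectorType>(PassThru->getType());
  // <N x i16> -> <N x i1>: a lane is enabled if its mask element is non-zero.
  Value *Mask =
      Builder.CreateICmpNE(Mask16, Constant::getNullValue(Mask16->getType()));
  // <N x i64> -> <N x pointers> in the requested address space.
  auto *VPtrTy =
      VectorType::get(PointerType::get(DataTy->getScalarType(), AddrSpace),
                      DataTy->getElementCount());
  Value *VPtrs = Builder.CreateIntToPtr(Addrs, VPtrTy);
  // Disabled lanes take their value from PassThru.
  return Builder.CreateMaskedGather(DataTy, VPtrs, Alignment, Mask, PassThru);
}

As in the pass, a USM pointer uses the generic address space (4) and SLM uses 3, leaving the GPU back-end to deduce the concrete address space.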
8 changes: 4 additions & 4 deletions sycl/include/sycl/ext/intel/esimd/detail/intrin.hpp
@@ -64,8 +64,8 @@
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
          int ParentWidth = 0>
-__ESIMD_INTRIN __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
-                                               __ESIMD_DNS::vector_type_t<T, M>>
+__ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
+                                __ESIMD_DNS::vector_type_t<T, M>>
__esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset);

template <typename T, int N, int M, int ParentWidth = 0>
@@ -263,8 +263,8 @@ __ESIMD_INTRIN uint16_t __esimd_all(__ESIMD_DNS::vector_type_t<T, N> src)
// Implementations of ESIMD intrinsics for the SYCL host device
template <typename T, int N, int M, int VStride, int Width, int Stride,
          int ParentWidth>
-__ESIMD_INTRIN __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
-                                               __ESIMD_DNS::vector_type_t<T, M>>
+__ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
+                                __ESIMD_DNS::vector_type_t<T, M>>
__esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset) {
  uint16_t EltOffset = Offset / sizeof(T);
  assert(Offset % sizeof(T) == 0);
20 changes: 18 additions & 2 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
@@ -199,6 +199,20 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
                     __ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
    __ESIMD_INTRIN_END;

// Gather data from the given global or private addresses.
template <typename T, int N, size_t Align>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_gather_ld(
    __ESIMD_DNS::vector_type_t<uint64_t, N> vptr,
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;

// Gather data from the given SLM addresses.
template <typename T, int N, size_t Align>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_slm_gather_ld(
    __ESIMD_DNS::vector_type_t<uint32_t, N> vptr,
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;

/// Surface-based gather.
/// Supported platforms: DG2, PVC
///
@@ -212,8 +226,10 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to load per address.
-/// @tparam Transposed indicates if the data is transposed during the transfer.
-/// @tparam N is the SIMD size of operation (the number of addresses to access)
+/// @tparam Transposed indicates if the data is transposed during the
+/// transfer.
+/// @tparam N is the SIMD size of operation (the number of addresses to
+/// access)
/// @tparam SurfIndAliasT is the \ref sycl::accessor type.
/// @param pred is predicates.
/// @param offsets is the zero-based offsets in bytes.
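To connect the new declarations in memory_intrin.hpp with their intended use, here is a hedged sketch, not the actual library code, of how a header-level wrapper could form per-lane addresses from a USM base pointer plus byte offsets and forward them to __esimd_gather_ld. The helper name gather_ld_sketch and the default alignment are assumptions; the real implementation also dispatches to LSC or GenX SVM paths depending on the requested properties and VS.

// Hypothetical wrapper around __esimd_gather_ld (illustration only).
template <typename T, int N, size_t Align = alignof(T)>
__ESIMD_DNS::vector_type_t<T, N>
gather_ld_sketch(const T *Base,
                 __ESIMD_DNS::vector_type_t<uint64_t, N> ByteOffsets,
                 __ESIMD_DNS::simd_mask_storage_t<N> Pred,
                 __ESIMD_DNS::vector_type_t<T, N> PassThru) {
  // Per-lane absolute address = base pointer + byte offset.
  __ESIMD_DNS::vector_type_t<uint64_t, N> Addrs =
      ByteOffsets + reinterpret_cast<uint64_t>(Base);
  return __esimd_gather_ld<T, N, Align>(Addrs, Pred, PassThru);
}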
