Skip to content

Commit 9617939

Browse files
authored
[SYCL][ESIMD]Implement slm_scatter accepting compile time properties (#12591)
This implements the unified memory API for slm_scatter with local memory
1 parent e53e5d5 commit 9617939

File tree

6 files changed

+539
-38
lines changed

6 files changed

+539
-38
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 193 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2685,11 +2685,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
26852685
namespace detail {
26862686
template <typename T, int N, typename AccessorTy>
26872687
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
2688-
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
2689-
(std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2690-
is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
2688+
std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2689+
is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
26912690
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
26922691
uint32_t glob_offset, simd_mask<N> mask) {
2692+
2693+
static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
2694+
"Unexpected type or vector length");
26932695
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
26942696
// TODO (performance) use hardware-supported scale once BE supports it
26952697
constexpr int16_t scale = 0;
@@ -2820,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
28202822
/// @return is a vector of type T and size N * NElts.
28212823
///
28222824
template <typename T, int NElts, lsc_data_size DS, int N>
2823-
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
2824-
slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
2825-
__ESIMD_NS::simd_mask<N> pred,
2826-
__ESIMD_NS::simd<T, N * NElts> pass_thru) {
2825+
__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
2826+
simd_mask<N> pred,
2827+
simd<T, N * NElts> pass_thru) {
28272828
check_lsc_vector_size<NElts>();
28282829
check_lsc_data_size<T, DS>();
28292830
constexpr uint16_t AddressScale = 1;
@@ -2832,16 +2833,46 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
28322833
constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
28332834
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
28342835
using MsgT = typename lsc_expand_type<T>::type;
2835-
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
2836-
lsc_format_input<MsgT>(pass_thru);
2837-
__ESIMD_NS::simd<MsgT, N * NElts> Result =
2836+
simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2837+
simd<MsgT, N * NElts> Result =
28382838
__esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
28392839
AddressScale, ImmOffset, EDS, LSCVS,
28402840
Transposed, N>(pred.data(), offsets.data(),
28412841
PassThruExpanded.data());
28422842
return lsc_format_ret<T>(Result);
28432843
}
28442844

2845+
/// SLM scatter implementation.
2846+
/// Supported platforms: DG2, PVC
2847+
/// VISA instruction: lsc_store.slm
2848+
///
2849+
/// Scatters (stores) elements to SLM.
2850+
///
2851+
/// @tparam T is element type.
2852+
/// @tparam NElts is the number of elements to store per address.
2853+
/// @tparam DS is the data size.
2854+
/// @tparam N is the number of channels (platform dependent).
2855+
/// @param offsets is the zero-based offsets for SLM buffer in bytes.
2856+
/// @param vals is values to store.
2857+
/// @param pred is predicates.
2858+
///
2859+
template <typename T, int NElts, lsc_data_size DS, int N>
2860+
__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
2861+
simd<T, N * NElts> vals, simd_mask<N> pred) {
2862+
check_lsc_vector_size<NElts>();
2863+
check_lsc_data_size<T, DS>();
2864+
constexpr uint16_t AddressScale = 1;
2865+
constexpr int ImmOffset = 0;
2866+
constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2867+
constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2868+
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2869+
using MsgT = typename lsc_expand_type<T>::type;
2870+
simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2871+
__esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
2872+
ImmOffset, EDS, LSCVS, Transposed, N>(
2873+
pred.data(), offsets.data(), Tmp.data());
2874+
}
2875+
28452876
} // namespace detail
28462877

28472878
/// @endcond ESIMD_DETAIL
@@ -3903,7 +3934,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
39033934
static_assert(Alignment >= sizeof(T),
39043935
"slm_gather() requires at least element-size alignment");
39053936

3906-
if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
3937+
if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
39073938
!detail::isMaskedGatherScatterLLVMAvailable())) {
39083939
simd<T, N> PassThru; // Intentionally undefined
39093940
return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
@@ -4118,7 +4149,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
41184149
/// @param byte_offsets the vector of 32-bit offsets in bytes.
41194150
/// For each i, (byte_offsets[i]) must be element size aligned.
41204151
/// @param props The optional compile-time properties. Only 'alignment'
4121-
/// and cache hint properties are used.
4152+
/// property is used.
41224153
/// @return A vector of elements read.
41234154
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
41244155
typename PropertyListT =
@@ -4141,17 +4172,156 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
41414172
return Res[0];
41424173
}
41434174

4144-
/// Scatter operation over the Shared Local Memory.
4145-
/// This API has almost the same interface as the @ref accessor_scatter
4146-
/// "accessor-based scatter", except that it does not have the accessor and
4147-
/// the global offset parameters.
4148-
///
4149-
template <typename T, int N>
4150-
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
4151-
(sizeof(T) <= 4)>
4152-
slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
4153-
detail::LocalAccessorMarker acc;
4154-
detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
4175+
/// template <typename T, int N, int VS = 1,
4176+
/// typename PropertyListT = empty_properties_t>
4177+
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4178+
/// simd<T, N> vals, simd_mask<N / VS> mask,
4179+
/// PropertyListT props = {}); // (slm-sc-1)
4180+
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4181+
/// simd<T, N> vals, PropertyListT props = {}); // (slm-sc-2)
4182+
///
4183+
/// The next 2 functions are variations of the first 2 above (slm-sc-1,2)
4184+
/// and were added only to support simd_view instead of simd for byte_offsets.
4185+
/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4186+
/// typename PropertyListT = empty_props_t>
4187+
/// void slm_scatter(OffsetSimdViewT byte_offsets,
4188+
/// simd<T, N> vals, simd_mask<N / VS> mask,
4189+
/// PropertyListT props = {}); // (slm-sc-3)
4190+
/// void slm_scatter(OffsetSimdViewT byte_offsets,
4191+
/// simd<T, N> vals, PropertyListT props = {}); // (slm-sc-4)
4192+
4193+
/// template <typename T, int N, int VS = 1,
4194+
/// typename PropertyListT = empty_properties_t>
4195+
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4196+
/// simd<T, N> vals, simd_mask<N / VS> mask,
4197+
/// PropertyListT props = {}); // (slm-sc-1)
4198+
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
4199+
/// locations addressed by byte offsets \p byte_offsets. Storage of any element
4200+
/// can be disabled via the input vector of predicates \p mask.
4201+
/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
4202+
/// @tparam T Element type.
4203+
/// @tparam N Number of elements to write.
4204+
/// @tparam VS Vector size. It can also be read as the number of writes per each
4205+
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4206+
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
4207+
/// @param byte_offsets the vector of 32-bit offsets in bytes.
4208+
/// For each i, (byte_offsets[i]) must be element size aligned.
4209+
/// If the alignment property is not passed, then it is assumed that each
4210+
/// accessed address is aligned by element-size.
4211+
/// @param vals The vector of values to store.
4212+
/// @param mask The access mask.
4213+
/// @param props The optional compile-time properties. Only 'alignment' property
4214+
/// is used.
4215+
template <typename T, int N, int VS = 1,
4216+
typename PropertyListT =
4217+
ext::oneapi::experimental::detail::empty_properties_t>
4218+
__ESIMD_API std::enable_if_t<
4219+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4220+
slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4221+
simd_mask<N / VS> mask, PropertyListT props = {}) {
4222+
static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
4223+
4224+
constexpr size_t Alignment =
4225+
detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4226+
static_assert(Alignment >= sizeof(T),
4227+
"slm_scatter() requires at least element-size alignment");
4228+
4229+
// Use LSC lowering if VS > 1 or if scatter_impl() cannot handle N / sizeof(T).
4230+
if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) {
4231+
__ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
4232+
byte_offsets, vals, mask);
4233+
} else {
4234+
detail::LocalAccessorMarker acc;
4235+
detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
4236+
}
4237+
}
4238+
4239+
/// template <typename T, int N, int VS = 1,
4240+
/// typename PropertyListT = empty_properties_t>
4241+
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4242+
/// PropertyListT props = {}); // (slm-sc-2)
4243+
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
4244+
/// locations addressed by byte offsets \p byte_offsets.
4245+
/// @tparam T Element type.
4246+
/// @tparam N Number of elements to write.
4247+
/// @tparam VS Vector size. It can also be read as the number of writes per each
4248+
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4249+
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
4250+
/// @param byte_offsets the vector of 32-bit offsets in bytes.
4251+
/// For each i, (byte_offsets[i]) must be element size aligned.
4252+
/// @param vals The vector of values to store.
4253+
/// @param props The optional compile-time properties. Only 'alignment'
4254+
/// property is used.
4255+
template <typename T, int N, int VS = 1,
4256+
typename PropertyListT =
4257+
ext::oneapi::experimental::detail::empty_properties_t>
4258+
__ESIMD_API std::enable_if_t<
4259+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4260+
slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4261+
PropertyListT props = {}) {
4262+
simd_mask<N / VS> Mask = 1;
4263+
slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
4264+
}
4265+
4266+
/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4267+
/// typename PropertyListT = empty_props_t>
4268+
/// void slm_scatter(
4269+
/// OffsetSimdViewT byte_offsets, simd<T, N> vals,
4270+
/// simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
4271+
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
4272+
/// locations addressed by byte offsets \p byte_offsets.
4273+
/// Storage to any element's memory location can be disabled via the
4274+
/// input vector of predicates \p mask. If mask[i] is unset, then the storage to
4275+
/// (byte_offsets[i]) is skipped.
4276+
/// @tparam T Element type.
4277+
/// @tparam N Number of elements to write.
4278+
/// @tparam VS Vector size. It can also be read as the number of writes per each
4279+
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4280+
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
4281+
/// @param byte_offsets the vector of 32-bit offsets in bytes.
4282+
/// For each i, (byte_offsets[i]) must be element size aligned.
4283+
/// If the alignment property is not passed, then it is assumed that each
4284+
/// accessed address is aligned by element-size.
4285+
/// @param vals The vector of values to store.
4286+
/// @param mask The access mask.
4287+
/// @param props The optional compile-time properties. Only 'alignment'
4288+
/// property is used.
4289+
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4290+
typename PropertyListT =
4291+
ext::oneapi::experimental::detail::empty_properties_t>
4292+
__ESIMD_API std::enable_if_t<
4293+
detail::is_simd_view_type_v<OffsetSimdViewT> &&
4294+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4295+
slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4296+
simd_mask<N / VS> mask, PropertyListT props = {}) {
4297+
slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
4298+
}
4299+
4300+
/// void slm_scatter(
4301+
/// OffsetSimdViewT byte_offsets, simd<T, N> vals,
4302+
/// PropertyListT props = {}); // (slm-sc-4)
4303+
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
4304+
/// locations addressed by byte offsets \p byte_offsets. This overload takes
4305+
/// no mask, so all elements are stored.
4306+
/// @tparam T Element type.
4307+
/// @tparam N Number of elements to write.
4308+
/// @tparam VS Vector size. It can also be read as the number of writes per each
4309+
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4310+
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
4311+
/// @param byte_offsets the vector of 32-bit offsets in bytes.
4312+
/// For each i, (byte_offsets[i]) must be element size aligned.
4313+
/// @param vals The vector of values to store.
4314+
/// @param props The optional compile-time properties. Only 'alignment'
4315+
/// property is used.
4316+
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4317+
typename PropertyListT =
4318+
ext::oneapi::experimental::detail::empty_properties_t>
4319+
__ESIMD_API std::enable_if_t<
4320+
detail::is_simd_view_type_v<OffsetSimdViewT> &&
4321+
ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4322+
slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4323+
PropertyListT props = {}) {
4324+
return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
41554325
}
41564326

41574327
/// Store a scalar value into the Shared Local Memory.

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,21 +1424,7 @@ template <typename T, int NElts = 1,
14241424
__ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
14251425
__ESIMD_NS::simd<T, N * NElts> vals,
14261426
__ESIMD_NS::simd_mask<N> pred = 1) {
1427-
detail::check_lsc_vector_size<NElts>();
1428-
detail::check_lsc_data_size<T, DS>();
1429-
constexpr uint16_t _AddressScale = 1;
1430-
constexpr int _ImmOffset = 0;
1431-
constexpr lsc_data_size _DS =
1432-
detail::expand_data_size(detail::finalize_data_size<T, DS>());
1433-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
1434-
constexpr detail::lsc_data_order _Transposed =
1435-
detail::lsc_data_order::nontranspose;
1436-
using MsgT = typename detail::lsc_expand_type<T>::type;
1437-
using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
1438-
__ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
1439-
__esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
1440-
_ImmOffset, _DS, _VS, _Transposed, N>(
1441-
pred.data(), offsets.data(), Tmp.data());
1427+
__ESIMD_DNS::slm_scatter_impl<T, NElts, DS>(offsets, vals, pred);
14421428
}
14431429

14441430
/// Transposed SLM scatter with 1 channel.

0 commit comments

Comments
 (0)