@@ -2685,11 +2685,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
2685
2685
namespace detail {
2686
2686
template <typename T, int N, typename AccessorTy>
2687
2687
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t <
2688
- (sizeof (T) <= 4 ) && (N == 1 || N == 8 || N == 16 || N == 32 ) &&
2689
- (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2690
- is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
2688
+ std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2689
+ is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
2691
2690
scatter_impl (AccessorTy acc, simd<T, N> vals, simd<uint32_t , N> offsets,
2692
2691
uint32_t glob_offset, simd_mask<N> mask) {
2692
+
2693
+ static_assert (sizeof (T) <= 4 && detail::isPowerOf2 (N, 32 ),
2694
+ " Unexpected type or vector length" );
2693
2695
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof (T)>();
2694
2696
// TODO (performance) use hardware-supported scale once BE supports it
2695
2697
constexpr int16_t scale = 0 ;
@@ -2820,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
2820
2822
// / @return is a vector of type T and size N * NElts.
2821
2823
// /
2822
2824
template <typename T, int NElts, lsc_data_size DS, int N>
2823
- __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
2824
- slm_gather_impl (__ESIMD_NS::simd<uint32_t , N> offsets,
2825
- __ESIMD_NS::simd_mask<N> pred,
2826
- __ESIMD_NS::simd<T, N * NElts> pass_thru) {
2825
+ __ESIMD_API simd<T, N * NElts> slm_gather_impl (simd<uint32_t , N> offsets,
2826
+ simd_mask<N> pred,
2827
+ simd<T, N * NElts> pass_thru) {
2827
2828
check_lsc_vector_size<NElts>();
2828
2829
check_lsc_data_size<T, DS>();
2829
2830
constexpr uint16_t AddressScale = 1 ;
@@ -2832,16 +2833,46 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
2832
2833
constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2833
2834
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2834
2835
using MsgT = typename lsc_expand_type<T>::type;
2835
- __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
2836
- lsc_format_input<MsgT>(pass_thru);
2837
- __ESIMD_NS::simd<MsgT, N * NElts> Result =
2836
+ simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2837
+ simd<MsgT, N * NElts> Result =
2838
2838
__esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
2839
2839
AddressScale, ImmOffset, EDS, LSCVS,
2840
2840
Transposed, N>(pred.data (), offsets.data (),
2841
2841
PassThruExpanded.data ());
2842
2842
return lsc_format_ret<T>(Result);
2843
2843
}
2844
2844
2845
+ // / SLM scatter implementation.
2846
+ // / Supported platforms: DG2, PVC
2847
+ // / VISA instruction: lsc_store.slm
2848
+ // /
2849
+ // / Scatters elements to SLM.
2850
+ // /
2851
+ // / @tparam T is element type.
2852
+ // / @tparam NElts is the number of elements to store per address.
2853
+ // / @tparam DS is the data size.
2854
+ // / @tparam N is the number of channels (platform dependent).
2855
+ // / @param offsets is the vector of zero-based byte offsets into the SLM buffer.
2856
+ // / @param vals is values to store.
2857
+ // / @param pred is predicates.
2858
+ // /
2859
+ template <typename T, int NElts, lsc_data_size DS, int N>
2860
+ __ESIMD_API void slm_scatter_impl (simd<uint32_t , N> offsets,
2861
+ simd<T, N * NElts> vals, simd_mask<N> pred) {
2862
+ check_lsc_vector_size<NElts>();
2863
+ check_lsc_data_size<T, DS>();
2864
+ constexpr uint16_t AddressScale = 1 ;
2865
+ constexpr int ImmOffset = 0 ;
2866
+ constexpr lsc_data_size EDS = expand_data_size (finalize_data_size<T, DS>());
2867
+ constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2868
+ constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2869
+ using MsgT = typename lsc_expand_type<T>::type;
2870
+ simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2871
+ __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
2872
+ ImmOffset, EDS, LSCVS, Transposed, N>(
2873
+ pred.data (), offsets.data (), Tmp.data ());
2874
+ }
2875
+
2845
2876
} // namespace detail
2846
2877
2847
2878
// / @endcond ESIMD_DETAIL
@@ -3903,7 +3934,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
3903
3934
static_assert (Alignment >= sizeof (T),
3904
3935
" slm_gather() requires at least element-size alignment" );
3905
3936
3906
- if constexpr (VS > 1 || (!detail::isPowerOf2 (N, 32 ) &&
3937
+ if constexpr (VS > 1 || (!( detail::isPowerOf2 (N, 32 ) && sizeof (T) <= 4 ) &&
3907
3938
!detail::isMaskedGatherScatterLLVMAvailable ())) {
3908
3939
simd<T, N> PassThru; // Intentionally undefined
3909
3940
return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
@@ -4118,7 +4149,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4118
4149
// / @param byte_offsets the vector of 32-bit offsets in bytes.
4119
4150
// / For each i, (byte_offsets[i]) must be element size aligned.
4120
4151
// / @param props The optional compile-time properties. Only 'alignment'
4121
- // / and cache hint properties are used.
4152
+ // / property is used.
4122
4153
// / @return A vector of elements read.
4123
4154
template <typename T, int N, int VS = 1 , typename OffsetSimdViewT,
4124
4155
typename PropertyListT =
@@ -4141,17 +4172,156 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
4141
4172
return Res[0 ];
4142
4173
}
4143
4174
4144
- // / Scatter operation over the Shared Local Memory.
4145
- // / This API has almost the same interface as the @ref accessor_scatter
4146
- // / "accessor-based scatter", except that it does not have the accessor and
4147
- // / the global offset parameters.
4148
- // /
4149
- template <typename T, int N>
4150
- __ESIMD_API std::enable_if_t <(N == 1 || N == 8 || N == 16 || N == 32 ) &&
4151
- (sizeof (T) <= 4)>
4152
- slm_scatter(simd<uint32_t , N> offsets, simd<T, N> vals, simd_mask<N> mask = 1 ) {
4153
- detail::LocalAccessorMarker acc;
4154
- detail::scatter_impl<T, N>(acc, vals, offsets, 0 , mask);
4175
+ // / template <typename T, int N, int VS = 1,
4176
+ // / typename PropertyListT = empty_properties_t>
4177
+ // / void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4178
+ // / simd<T, N> vals, simd_mask<N / VS> mask,
4179
+ // / PropertyListT props = {}); // (slm-sc-1)
4180
+ // / void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4181
+ // / simd<T, N> vals, PropertyListT props = {}); // (slm-sc-2)
4182
+ // /
4183
+ // / The next 2 functions are variations of the first 2 above (slm-sc-1,2)
4184
+ // / and were added only to support simd_view instead of simd for byte_offsets.
4185
+ // / template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4186
+ // / typename PropertyListT = empty_props_t>
4187
+ // / void slm_scatter(OffsetSimdViewT byte_offsets,
4188
+ // / simd<T, N> vals, simd_mask<N / VS> mask,
4189
+ // / PropertyListT props = {}); // (slm-sc-3)
4190
+ // / void slm_scatter(OffsetSimdViewT byte_offsets,
4191
+ // / simd<T, N> vals, PropertyListT props = {}); // (slm-sc-4)
4192
+
4193
+ // / template <typename T, int N, int VS = 1,
4194
+ // / typename PropertyListT = empty_properties_t>
4195
+ // / void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
4196
+ // / simd<T, N> vals, simd_mask<N / VS> mask,
4197
+ // / PropertyListT props = {}); // (slm-sc-1)
4198
+ // / Stores ("scatters") elements of the type 'T' to Shared Local Memory
4199
+ // / locations addressed by byte offsets \p byte_offsets. Storage of any element
4200
+ // / can be disabled via the input vector of predicates \p mask.
4201
+ // / If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
4202
+ // / @tparam T Element type.
4203
+ // / @tparam N Number of elements to write.
4204
+ // / @tparam VS Vector size. It can also be read as the number of writes per each
4205
+ // / address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4206
+ // / only on DG2 and PVC and only for 4- and 8-byte element vectors.
4207
+ // / @param byte_offsets the vector of 32-bit offsets in bytes.
4208
+ // / For each i, (byte_offsets[i]) must be element size aligned.
4209
+ // / If the alignment property is not passed, then it is assumed that each
4210
+ // / accessed address is aligned by element-size.
4211
+ // / @param vals The vector of values to store.
4212
+ // / @param mask The access mask.
4213
+ // / @param props The optional compile-time properties. Only 'alignment' property
4214
+ // / is used.
4215
+ template <typename T, int N, int VS = 1 ,
4216
+ typename PropertyListT =
4217
+ ext::oneapi::experimental::detail::empty_properties_t >
4218
+ __ESIMD_API std::enable_if_t <
4219
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4220
+ slm_scatter (simd<uint32_t , N / VS> byte_offsets, simd<T, N> vals,
4221
+ simd_mask<N / VS> mask, PropertyListT props = {}) {
4222
+ static_assert (N / VS >= 1 && N % VS == 0 , " N must be divisible by VS" );
4223
+
4224
+ constexpr size_t Alignment =
4225
+ detail::getPropertyValue<PropertyListT, alignment_key>(sizeof (T));
4226
+ static_assert (Alignment >= sizeof (T),
4227
+ " slm_scatter() requires at least element-size alignment" );
4228
+
4229
+ // Use LSC lowering if VS > 1, or if scatter_impl() cannot handle this T/N (it requires sizeof(T) <= 4 and isPowerOf2(N, 32)).
4230
+ if constexpr (VS > 1 || !(detail::isPowerOf2 (N, 32 ) && sizeof (T) <= 4 )) {
4231
+ __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
4232
+ byte_offsets, vals, mask);
4233
+ } else {
4234
+ detail::LocalAccessorMarker acc;
4235
+ detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0 , mask);
4236
+ }
4237
+ }
4238
+
4239
+ // / template <typename T, int N, int VS = 1,
4240
+ // / typename PropertyListT = empty_properties_t>
4241
+ // / void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4242
+ // / PropertyListT props = {}); // (slm-sc-2)
4243
+ // / Stores ("scatters") elements of the type 'T' to Shared Local Memory
4244
+ // / locations addressed by byte offsets \p byte_offsets.
4245
+ // / @tparam T Element type.
4246
+ // / @tparam N Number of elements to write.
4247
+ // / @tparam VS Vector size. It can also be read as the number of writes per each
4248
+ // / address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4249
+ // / only on DG2 and PVC and only for 4- and 8-byte element vectors.
4250
+ // / @param byte_offsets the vector of 32-bit offsets in bytes.
4251
+ // / For each i, (byte_offsets[i]) must be element size aligned.
4252
+ // / @param vals The vector of values to store.
4253
+ // / @param props The optional compile-time properties. Only 'alignment'
4254
+ // / property is used.
4255
+ template <typename T, int N, int VS = 1 ,
4256
+ typename PropertyListT =
4257
+ ext::oneapi::experimental::detail::empty_properties_t >
4258
+ __ESIMD_API std::enable_if_t <
4259
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4260
+ slm_scatter (simd<uint32_t , N / VS> byte_offsets, simd<T, N> vals,
4261
+ PropertyListT props = {}) {
4262
+ simd_mask<N / VS> Mask = 1 ;
4263
+ slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
4264
+ }
4265
+
4266
+ // / template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4267
+ // / typename PropertyListT = empty_props_t>
4268
+ // / void slm_scatter(
4269
+ // / OffsetSimdViewT byte_offsets, simd<T, N> vals,
4270
+ // / simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
4271
+ // / Stores ("scatters") elements of the type 'T' to Shared Local Memory
4272
+ // / locations addressed by byte offsets \p byte_offsets.
4273
+ // / Storage to any element's memory location can be disabled via the
4274
+ // / input vector of predicates \p mask. If mask[i] is unset, then the storage to
4275
+ // / (byte_offsets[i]) is skipped.
4276
+ // / @tparam T Element type.
4277
+ // / @tparam N Number of elements to write.
4278
+ // / @tparam VS Vector size. It can also be read as the number of writes per each
4279
+ // / address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4280
+ // / only on DG2 and PVC and only for 4- and 8-byte element vectors.
4281
+ // / @param byte_offsets the vector of 32-bit offsets in bytes.
4282
+ // / For each i, (byte_offsets[i]) must be element size aligned.
4283
+ // / If the alignment property is not passed, then it is assumed that each
4284
+ // / accessed address is aligned by element-size.
4285
+ // / @param vals The vector of values to store.
4286
+ // / @param mask The access mask.
4287
+ // / @param props The optional compile-time properties. Only 'alignment'
4288
+ // / property is used.
4289
+ template <typename T, int N, int VS = 1 , typename OffsetSimdViewT,
4290
+ typename PropertyListT =
4291
+ ext::oneapi::experimental::detail::empty_properties_t >
4292
+ __ESIMD_API std::enable_if_t <
4293
+ detail::is_simd_view_type_v<OffsetSimdViewT> &&
4294
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4295
+ slm_scatter (OffsetSimdViewT byte_offsets, simd<T, N> vals,
4296
+ simd_mask<N / VS> mask, PropertyListT props = {}) {
4297
+ slm_scatter<T, N, VS>(byte_offsets.read (), vals, mask, props);
4298
+ }
4299
+
4300
+ // / void slm_scatter(
4301
+ // / OffsetSimdViewT byte_offsets, simd<T, N> vals,
4302
+ // / PropertyListT props = {}); // (slm-sc-4)
4303
+ // / Stores ("scatters") elements of the type 'T' to Shared Local Memory
4304
+ // / locations addressed by byte offsets \p byte_offsets. This overload takes
4305
+ // / no mask, so all elements are stored.
4306
+ // / @tparam T Element type.
4307
+ // / @tparam N Number of elements to write.
4308
+ // / @tparam VS Vector size. It can also be read as the number of writes per each
4309
+ // / address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
4310
+ // / only on DG2 and PVC and only for 4- and 8-byte element vectors.
4311
+ // / @param byte_offsets the vector of 32-bit offsets in bytes.
4312
+ // / For each i, (byte_offsets[i]) must be element size aligned.
4313
+ // / @param vals The vector of values to store.
4314
+ // / @param props The optional compile-time properties. Only 'alignment'
4315
+ // / property is used.
4316
+ template <typename T, int N, int VS = 1 , typename OffsetSimdViewT,
4317
+ typename PropertyListT =
4318
+ ext::oneapi::experimental::detail::empty_properties_t >
4319
+ __ESIMD_API std::enable_if_t <
4320
+ detail::is_simd_view_type_v<OffsetSimdViewT> &&
4321
+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4322
+ slm_scatter (OffsetSimdViewT byte_offsets, simd<T, N> vals,
4323
+ PropertyListT props = {}) {
4324
+ return slm_scatter<T, N, VS>(byte_offsets.read (), vals, props);
4155
4325
}
4156
4326
4157
4327
// / Store a scalar value into the Shared Local Memory.
0 commit comments