diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 0ccf403371948..14230b3994e33 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -69,7 +69,7 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AddressScale is the address scale. /// @tparam ImmOffset is the immediate offset added to each address. /// @tparam DS is the data size. @@ -80,7 +80,7 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. /// @param surf_ind is the surface index. -template pred, /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam Transposed is the transposed version or not. /// @tparam NBlocks is the number of blocks. @@ -120,7 +120,7 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// otherwise, /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks -template @@ -136,7 +136,7 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam NBlocks is the number of blocks. /// @tparam Transposed is the transposed version or not. @@ -153,7 +153,7 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// number of elements. /// @param Y is zero based Y-coordinate of the left upper rectangle corner in /// rows. -template @@ -168,7 +168,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam Transposed is the transposed version or not. /// @tparam NBlocks is the number of blocks. @@ -190,7 +190,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( /// otherwise, /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks -template diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index abc07403a829b..f52cf8ab855bd 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -481,55 +481,55 @@ template constexpr uint32_t get_lsc_data_size() { } } -template +template constexpr uint32_t get_lsc_load_cache_mask() { if constexpr (L1H == cache_hint::read_invalidate && - L3H == cache_hint::cached) { + L2H == cache_hint::cached) { return 7; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::cached) { return 6; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::uncached) { return 5; } - if constexpr (L1H == cache_hint::cached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::cached && L2H == cache_hint::cached) { return 4; } - if constexpr (L1H == cache_hint::cached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::cached && L2H == cache_hint::uncached) { return 3; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::cached) { return 2; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::uncached) { return 1; } return 0; } -template +template constexpr uint32_t get_lsc_store_cache_mask() { - if constexpr (L1H == cache_hint::write_back && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::write_back && L2H == cache_hint::cached) { return 7; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::cached) { return 6; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::uncached) { return 5; } - if constexpr (L1H == cache_hint::write_through && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::write_through && L2H == cache_hint::cached) { return 4; } if constexpr (L1H == cache_hint::write_through && - L3H == cache_hint::uncached) { + L2H == cache_hint::uncached) { return 3; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::cached) { return 2; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::uncached) { return 1; } return 0; @@ -647,7 +647,7 @@ lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -656,12 +656,12 @@ lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - return __ESIMD_DNS::gather_impl(p, offsets, pred); + return __ESIMD_DNS::gather_impl(p, offsets, pred); } /// USM pointer gather. @@ -675,7 +675,7 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -686,58 +686,58 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, /// template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return __ESIMD_DNS::gather_impl(p, offsets, pred, + return __ESIMD_DNS::gather_impl(p, offsets, pred, pass_thru); } template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather(p, offsets.read(), pred); + return lsc_gather(p, offsets.read(), pred); } template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather(p, offsets.read(), pred, + return lsc_gather(p, offsets.read(), pred, pass_thru); } template __ESIMD_API std::enable_if_t, __ESIMD_NS::simd> lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather( + return lsc_gather( p, __ESIMD_NS::simd(offset), pred); } template __ESIMD_API std::enable_if_t, __ESIMD_NS::simd> lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather( + return lsc_gather( p, __ESIMD_NS::simd(offset), pred, pass_thru); } @@ -752,7 +752,7 @@ lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -762,7 +762,7 @@ lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< @@ -772,11 +772,11 @@ __ESIMD_API __ESIMD_NS::simd<__ESIMD_DNS::DeviceAccessorOffsetT, N> offsets, __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_gather( + return lsc_gather( reinterpret_cast(acc.get_pointer().get()), offsets, pred); #else __ESIMD_NS::simd PassThru; // Intentionally unitialized. - return __ESIMD_DNS::gather_impl( + return __ESIMD_DNS::gather_impl( acc, offsets, pred, PassThru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -784,7 +784,7 @@ __ESIMD_API #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -793,14 +793,14 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather( + return lsc_gather( acc, convert(offsets), pred); } #endif template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_local_accessor_with_v< @@ -823,7 +823,7 @@ __ESIMD_API /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -835,7 +835,7 @@ __ESIMD_API /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< @@ -846,11 +846,11 @@ __ESIMD_API __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_gather( + return lsc_gather( reinterpret_cast(acc.get_pointer().get()), offsets, pred, pass_thru); #else - return __ESIMD_DNS::gather_impl( + return __ESIMD_DNS::gather_impl( acc, offsets, pred, pass_thru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -858,7 +858,7 @@ __ESIMD_API #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -868,14 +868,14 @@ __ESIMD_API std::enable_if_t< lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather( + return lsc_gather( acc, convert(offsets), pred, pass_thru); } #endif template __ESIMD_API std::enable_if_t< sycl::detail::acc_properties::is_local_accessor_v, @@ -912,7 +912,7 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - @@ -923,13 +923,13 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// are undefined. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, flags); + return __ESIMD_DNS::block_load_impl(p, pred, flags); } /// A variation of lsc_block_load without predicate parameter to simplify use @@ -953,7 +953,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the @@ -961,12 +961,12 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, /// are undefined. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, FlagsT flags) { - return __ESIMD_DNS::block_load_impl( + return __ESIMD_DNS::block_load_impl( p, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -992,7 +992,7 @@ lsc_block_load(const T *p, FlagsT flags) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. @@ -1002,13 +1002,13 @@ lsc_block_load(const T *p, FlagsT flags) { /// @return is a vector of type T and size NElts. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, pass_thru, + return __ESIMD_DNS::block_load_impl(p, pred, pass_thru, flags); } @@ -1032,7 +1032,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1044,7 +1044,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, /// vector for which the corresponding element in \p pred is 0 are undefined. /// template __ESIMD_API std::enable_if_t< @@ -1054,12 +1054,12 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, + return __ESIMD_DNS::block_load_impl(acc, offset, pred, flags); } template __ESIMD_API std::enable_if_t< @@ -1092,7 +1092,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1101,7 +1101,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, /// vector for which the corresponding element in \p pred is 0 are undefined. /// template __ESIMD_API std::enable_if_t< @@ -1111,12 +1111,12 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, FlagsT flags) { - return lsc_block_load( + return lsc_block_load( acc, offset, __ESIMD_NS::simd_mask<1>(1), flags); } template __ESIMD_API std::enable_if_t< @@ -1125,7 +1125,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { - return lsc_block_load( + return lsc_block_load( acc, offset, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1149,7 +1149,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1162,7 +1162,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { /// @return is a vector of type T and size NElts /// template __ESIMD_API std::enable_if_t< @@ -1173,12 +1173,12 @@ __ESIMD_API std::enable_if_t< lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred, __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, + return __ESIMD_DNS::block_load_impl(acc, offset, pred, pass_thru, flags); } template __ESIMD_API std::enable_if_t< @@ -1202,7 +1202,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -1210,30 +1210,30 @@ lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// template __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - __ESIMD_DNS::prefetch_impl(p, offsets, pred); + __ESIMD_DNS::prefetch_impl(p, offsets, pred); } template __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch(p, offsets.read(), pred); + lsc_prefetch(p, offsets.read(), pred); } template __ESIMD_API std::enable_if_t> lsc_prefetch(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch( + lsc_prefetch( p, __ESIMD_NS::simd(offset), pred); } @@ -1247,15 +1247,15 @@ lsc_prefetch(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// template + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> __ESIMD_API void lsc_prefetch(const T *p) { __ESIMD_NS::simd_mask<1> Mask = 1; - __ESIMD_DNS::prefetch_impl(p, 0, Mask); + __ESIMD_DNS::prefetch_impl(p, 0, Mask); } /// Accessor-based prefetch gather. @@ -1268,7 +1268,7 @@ __ESIMD_API void lsc_prefetch(const T *p) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -1277,7 +1277,7 @@ __ESIMD_API void lsc_prefetch(const T *p) { /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_read>> @@ -1289,12 +1289,12 @@ lsc_prefetch(AccessorTy acc, #endif __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_prefetch( + return lsc_prefetch( __ESIMD_DNS::accessorToPointer(acc), offsets, pred); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -1304,7 +1304,7 @@ lsc_prefetch(AccessorTy acc, detail::lsc_data_order::nontranspose; using MsgT = typename detail::lsc_expand_type::type; auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_prefetch_bti(pred.data(), offsets.data(), si); #endif } @@ -1312,7 +1312,7 @@ lsc_prefetch(AccessorTy acc, #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -1320,7 +1320,7 @@ __ESIMD_API std::enable_if_t< std::is_integral_v && !std::is_same_v> lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch( + lsc_prefetch( acc, convert(offsets), pred); } #endif @@ -1335,25 +1335,25 @@ lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_read>> lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) { #ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_prefetch( + lsc_prefetch( __ESIMD_DNS::accessorToPointer(acc, offset)); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = detail::finalize_data_size(); @@ -1367,7 +1367,7 @@ lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) { __ESIMD_NS::simd_mask pred = 1; __ESIMD_NS::simd offsets = offset; auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_prefetch_bti(pred.data(), offsets.data(), si); #endif } @@ -1426,7 +1426,7 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -1435,34 +1435,34 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, /// template __ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - __ESIMD_DNS::scatter_impl(p, offsets, + __ESIMD_DNS::scatter_impl(p, offsets, vals, pred); } template __ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter(p, offsets.read(), vals, pred); + lsc_scatter(p, offsets.read(), vals, pred); } template __ESIMD_API std::enable_if_t && N == 1> lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter( + lsc_scatter( p, __ESIMD_NS::simd(offset), vals, pred); } @@ -1476,7 +1476,7 @@ lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -1486,7 +1486,7 @@ lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_write>> @@ -1495,17 +1495,17 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), + lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), offsets, vals, pred); #else - __ESIMD_DNS::scatter_impl(acc, offsets, vals, pred); + __ESIMD_DNS::scatter_impl(acc, offsets, vals, pred); #endif } #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -1514,14 +1514,14 @@ __ESIMD_API std::enable_if_t< lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter( + lsc_scatter( acc, convert(offsets), vals, pred); } #endif template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_local_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_write>> @@ -1556,7 +1556,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param vals is values to store. /// @param pred is operation predicate. Zero means operation is skipped @@ -1565,12 +1565,12 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v> lsc_block_store(T *p, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_store_impl(p, vals, pred, + return __ESIMD_DNS::block_store_impl(p, vals, pred, flags); } @@ -1597,17 +1597,17 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param vals is values to store. /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v> lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { - lsc_block_store(p, vals, __ESIMD_NS::simd_mask<1>(1), + lsc_block_store(p, vals, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1635,7 +1635,7 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1646,7 +1646,7 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t< @@ -1656,12 +1656,12 @@ __ESIMD_API std::enable_if_t< lsc_block_store(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - __ESIMD_DNS::block_store_impl(acc, offset, vals, pred, + __ESIMD_DNS::block_store_impl(acc, offset, vals, pred, flags); } template __ESIMD_API std::enable_if_t< @@ -1697,7 +1697,7 @@ lsc_block_store(AccessorTy acc, uint32_t offset, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1705,7 +1705,7 @@ lsc_block_store(AccessorTy acc, uint32_t offset, /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t< @@ -1714,7 +1714,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::is_simd_flag_type_v> lsc_block_store(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd vals, FlagsT flags) { - lsc_block_store(acc, offset, vals, + lsc_block_store(acc, offset, vals, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1804,7 +1804,7 @@ constexpr void check_lsc_block_2d_restrictions() { /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -1822,14 +1822,14 @@ constexpr void check_lsc_block_2d_restrictions() { /// template ()> __ESIMD_API __ESIMD_NS::simd lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -1866,7 +1866,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, Transposed ? detail::lsc_data_order::transpose : detail::lsc_data_order::nontranspose; __ESIMD_NS::simd Raw = - __esimd_lsc_load2d_stateless(pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, @@ -1920,7 +1920,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, /// @tparam BlockHeight is the block height in number of elements. /// @tparam NBlocks is the number of blocks. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -1932,13 +1932,13 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, /// rows. /// template ()> __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -1948,7 +1948,7 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, uintptr_t surf_addr = reinterpret_cast(Ptr); constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; - __esimd_lsc_prefetch2d_stateless( pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); } @@ -1963,7 +1963,7 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, /// @tparam BlockWidth is the block width in number of elements. /// @tparam BlockHeight is the block height in number of elements. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -1978,14 +1978,14 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, /// getNextPowerOf2(BlockWidth) * NBlocks /// template ()> __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, __ESIMD_NS::simd Vals) { using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -2009,7 +2009,7 @@ __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, } __ESIMD_NS::simd_mask pred = 1; - __esimd_lsc_store2d_stateless( pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Raw.data()); @@ -2202,19 +2202,19 @@ class config_2d_mem_access { __ESIMD_NS::simd payload_data; template friend ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( config_2d_mem_access &payload); template + cache_hint L1H, cache_hint L2H, int N> friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_store_2d( config_2d_mem_access &payload, __ESIMD_NS::simd Data); template friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( config_2d_mem_access &payload); @@ -2232,7 +2232,7 @@ class config_2d_mem_access { /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// @return is a vector of type T and size N, where N is @@ -2241,7 +2241,7 @@ class config_2d_mem_access { /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( @@ -2249,7 +2249,7 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr int ElemsPerDword = 4 / sizeof(T); constexpr int GRFRowSize = Transposed ? BlockHeight : Transformed ? BlockWidth * ElemsPerDword @@ -2277,7 +2277,7 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( static_assert(N == ActualN || N == DstElements, "Incorrect element count"); - constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; constexpr uint32_t base_desc = 0x2000003; constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0; @@ -2329,24 +2329,24 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( config_2d_mem_access &payload) { - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); static_assert(!Transposed || !Transformed, "Transposed and transformed is not supported"); - constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; constexpr uint32_t base_desc = 0x2000003; @@ -2372,13 +2372,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( /// @tparam BlockHeight block height in number of elements /// @tparam NBlocks Number of blocks /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// @param Data is the data to be stored. /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION void @@ -2387,9 +2387,9 @@ lsc_store_2d(config_2d_mem_access &payload, detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); - constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() << 17; constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; constexpr uint32_t base_desc = 0x2000007; @@ -2498,32 +2498,32 @@ lsc_slm_atomic_update(__ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param pred is predicates. /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 0, __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), pred); } @@ -2536,7 +2536,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -2544,32 +2544,32 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename OffsetObjT, typename RegionTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update(p, offsets.read(), src0, + return lsc_atomic_update(p, offsets.read(), src0, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 1 && @@ -2579,7 +2579,7 @@ __ESIMD_API std::enable_if_t && __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), src0, pred); } @@ -2592,7 +2592,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand (expected value). @@ -2601,33 +2601,33 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, src1, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename OffsetObjT, typename RegionTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update(p, offsets.read(), src0, + return lsc_atomic_update(p, offsets.read(), src0, src1, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 2, @@ -2635,7 +2635,7 @@ __ESIMD_API std::enable_if_t && lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), src0, src1, pred); } @@ -2648,7 +2648,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2658,7 +2658,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -2669,7 +2669,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, + return __ESIMD_DNS::atomic_update_impl(acc, offsets, pred); } @@ -2680,7 +2680,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2690,7 +2690,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> @@ -2709,7 +2709,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2720,13 +2720,13 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, + return __ESIMD_DNS::atomic_update_impl(acc, offsets, src0, pred); } @@ -2737,7 +2737,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2748,7 +2748,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> @@ -2767,7 +2767,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2779,7 +2779,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, __ESIMD_NS::simd> @@ -2787,7 +2787,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_atomic_update( + return lsc_atomic_update( __ESIMD_DNS::accessorToPointer(acc), offsets, src0, src1, pred); #else static_assert(std::is_integral_v && sizeof(Toffset) == 4, @@ -2795,7 +2795,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); __ESIMD_DNS::check_atomic(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -2809,7 +2809,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd Msg_data1 = detail::lsc_format_input(src1); auto si = __ESIMD_NS::get_surface_index(acc); __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_bti_2( pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data(), si); return detail::lsc_format_ret(Tmp); @@ -2823,7 +2823,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2835,7 +2835,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp index 5857bce003f89..42c437700c0c1 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { @@ -90,14 +90,14 @@ bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, simd vals; if constexpr (use_prefetch) { - lsc_prefetch_2d( + lsc_prefetch_2d( in + off, width, height, pitch, X, Y); vals = lsc_load_2d(in + off, width, height, pitch, X, Y); } else { vals = lsc_load_2d( + Transposed, Transformed, L1H, L2H>( in + off, width, height, pitch, X, Y); } diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp index 3264a16ffc2df..ee553fd12c33b 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp @@ -53,7 +53,7 @@ template (), - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { @@ -89,7 +89,7 @@ bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, simd vals(new_val + off, 1); // IUT - lsc_store_2d( + lsc_store_2d( out + off, SurfaceWidth * sizeof(T) - 1, SurfaceHeight - 1, SurfacePitch * sizeof(T) - 1, X, Y, vals); }); diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp index 97ad1a68dfadd..54359e458793a 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template < int case_num, typename T, uint32_t Groups, uint32_t Threads, uint16_t VL, uint16_t VS, bool transpose, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool use_prefetch = false, typename Flags = __ESIMD_NS::overaligned_tag<4>> bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -87,19 +87,19 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals; if constexpr (use_prefetch) { - lsc_prefetch(acci, byte_off); + lsc_prefetch(acci, byte_off); if constexpr (sizeof(T) < 8) { - vals = lsc_block_load(acci, byte_off, + vals = lsc_block_load(acci, byte_off, Flags{}); } else { - vals = lsc_block_load(acci, byte_off); + vals = lsc_block_load(acci, byte_off); } } else { if constexpr (sizeof(T) < 8) { - vals = lsc_block_load(acci, byte_off, + vals = lsc_block_load(acci, byte_off, Flags{}); } else { - vals = lsc_block_load(acci, byte_off); + vals = lsc_block_load(acci, byte_off); } } if constexpr (sizeof(T) < 8) { @@ -115,11 +115,11 @@ bool test(uint32_t pmask = 0xffffffff) { simd vals; if constexpr (use_prefetch) { - lsc_prefetch(acci, offset, pred); + lsc_prefetch(acci, offset, pred); vals = lsc_gather(acci, offset, pred); } else { - vals = lsc_gather(acci, offset, pred); + vals = lsc_gather(acci, offset, pred); } if constexpr (DS == lsc_data_size::u8u32 || diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp index 0331db4ba8a73..ea118012f07f5 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template > bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -82,10 +82,10 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals(new_val + elem_off, 1); if constexpr (sizeof(T) < 8) { - lsc_block_store(acco, byte_off, vals, + lsc_block_store(acco, byte_off, vals, Flags{}); } else { - lsc_block_store(acco, byte_off, vals); + lsc_block_store(acco, byte_off, vals); } } else { simd offset(byte_off, VS * sizeof(T)); @@ -99,7 +99,7 @@ bool test(uint32_t pmask = 0xffffffff) { for (int j = 0; j < VS; j++) vals.template select<1, 1>(i + j * VL) = val++; - lsc_scatter(acco, offset, vals, pred); + lsc_scatter(acco, offset, vals, pred); } }); }); diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp index bab1dc2991894..f82e0d0dc52c5 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template > bool test(queue Q, uint32_t Groups, uint32_t Threads) { @@ -63,7 +63,7 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { simd_mask<1> Mask = GlobalID % 1; if constexpr (UsePrefetch) { - lsc_prefetch(In + ElemOffset); + lsc_prefetch(In + ElemOffset); if constexpr (sizeof(T) < 8) { Vals = lsc_block_load(In + ElemOffset, Mask, OldValues, Flags{}); @@ -72,16 +72,16 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { } } else { if constexpr (sizeof(T) < 8) { - Vals = lsc_block_load(In + ElemOffset, Mask, + Vals = lsc_block_load(In + ElemOffset, Mask, OldValues, Flags{}); } else { - Vals = lsc_block_load(In + ElemOffset, Mask, + Vals = lsc_block_load(In + ElemOffset, Mask, OldValues); } } } else { if constexpr (UsePrefetch) { - lsc_prefetch(In + ElemOffset); + lsc_prefetch(In + ElemOffset); if constexpr (sizeof(T) < 8) { Vals = lsc_block_load(In + ElemOffset, Flags{}); } else { @@ -90,9 +90,9 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { } else { if constexpr (sizeof(T) < 8) { Vals = - lsc_block_load(In + ElemOffset, Flags{}); + lsc_block_load(In + ElemOffset, Flags{}); } else { - Vals = lsc_block_load(In + ElemOffset); + Vals = lsc_block_load(In + ElemOffset); } } } @@ -138,7 +138,7 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { template bool test_lsc_block_load() { constexpr lsc_data_size DS = lsc_data_size::default_size; constexpr cache_hint L1H = cache_hint::none; - constexpr cache_hint L3H = cache_hint::none; + constexpr cache_hint L2H = cache_hint::none; constexpr bool NoPrefetch = false; constexpr bool CheckMerge = true; @@ -150,52 +150,52 @@ template bool test_lsc_block_load() { << Q.get_device().get_info() << std::endl; bool Passed = true; - Passed &= test(Q, 1, 4); - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); if constexpr (sizeof(T) <= 4) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) == 2) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); } if constexpr (sizeof(T) == 1) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); } } - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 4, 4); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 4, 4); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); // Only 512-bits maximum can be loaded at once (i.e. 4*128 bytes). if constexpr (sizeof(T) <= 4) - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) <= 2) - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) == 1) - Passed &= test>(Q, 1, 4); return Passed; @@ -205,7 +205,7 @@ template std::enable_if_t test_lsc_prefetch() { constexpr cache_hint L1H = cache_hint::cached; - constexpr cache_hint L3H = cache_hint::uncached; + constexpr cache_hint L2H = cache_hint::uncached; constexpr bool DoPrefetch = true; auto Q = queue{gpu_selector_v}; @@ -214,15 +214,15 @@ std::enable_if_t test_lsc_prefetch() { << Q.get_device().get_info() << std::endl; bool Passed = true; - Passed &= test(Q, 1, 4); - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); return Passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp index 21c1991cc7267..117b03d919f96 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp @@ -26,7 +26,7 @@ typedef uint32_t Toffset; template bool test(queue q, uint32_t Groups, uint32_t Threads, uint32_t pmask = 0xffffffff) { @@ -90,15 +90,15 @@ bool test(queue q, uint32_t Groups, uint32_t Threads, simd vals; if constexpr (use_prefetch) { - lsc_prefetch(in, offset, pred); + lsc_prefetch(in, offset, pred); vals = lsc_gather( in, offset, pred); } else if constexpr (!use_old_values) { - vals = lsc_gather(in, offset, pred); + vals = lsc_gather(in, offset, pred); } else { // use_old_values simd old_values = merge_value; vals = - lsc_gather(in, offset, pred, old_values); + lsc_gather(in, offset, pred, old_values); } if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) @@ -158,38 +158,38 @@ template bool test_lsc_gather_prefetch(queue q) { constexpr cache_hint L1H = cache_hint::cached; - constexpr cache_hint L3H = cache_hint::uncached; + constexpr cache_hint L2H = cache_hint::uncached; constexpr bool DoMerging = true; bool Passed = true; Passed &= - test(q, 4, 4, rand()); + test(q, 4, 4, rand()); if constexpr (!DoPrefetch) Passed &= - test(q, 4, 4, rand()); + test(q, 4, 4, rand()); #ifndef USE_SCALAR_OFFSET // These tests use lsc_scatter with scalar offset when USE_SCALAR_OFFSET macro // is set, which is UB and thus guarded by the macro here. - Passed &= test(q, 1, 4, rand()); - Passed &= test(q, 2, 4, rand()); - Passed &= test(q, 2, 2, rand()); - Passed &= test(q, 4, 2, rand()); - Passed &= test(q, 4, 16, rand()); + Passed &= test(q, 1, 4, rand()); + Passed &= test(q, 2, 4, rand()); + Passed &= test(q, 2, 2, rand()); + Passed &= test(q, 4, 2, rand()); + Passed &= test(q, 4, 16, rand()); // The next block of tests is only for gather with merging semantics, // not for prefetch tests. if constexpr (!DoPrefetch) { Passed &= - test(q, 1, 4, rand()); + test(q, 1, 4, rand()); Passed &= - test(q, 4, 16, rand()); + test(q, 4, 16, rand()); } if constexpr (((DS == lsc_data_size::default_size && sizeof(T) >= 4) || DS == lsc_data_size::u32 || DS == lsc_data_size::u32) && !DoPrefetch) { - Passed &= test(q, 2, 4, rand()); + Passed &= test(q, 2, 4, rand()); } #endif // !USE_SCALAR_OFFSET diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp index 4765152b644c8..e7864d6401835 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp @@ -26,7 +26,7 @@ typedef uint32_t Toffset; template > bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -86,10 +86,10 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals(new_val + elem_off, 1); if constexpr (sizeof(T) < 8) { - lsc_block_store(out + elem_off, vals, + lsc_block_store(out + elem_off, vals, Flags{}); } else { - lsc_block_store(out + elem_off, vals); + lsc_block_store(out + elem_off, vals); } } else { simd offset(byte_off, VS * sizeof(T)); @@ -103,7 +103,7 @@ bool test(uint32_t pmask = 0xffffffff) { for (int j = 0; j < VS; j++) vals.template select<1, 1>(i + j * VL) = val++; - lsc_scatter(out, offset, vals, pred); + lsc_scatter(out, offset, vals, pred); } }); }); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp index 65d5afe54c541..92b60cfe4a4bd 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp @@ -15,30 +15,30 @@ constexpr uint32_t seed = 322; using T = uint16_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 32, 1, false, false, L1H, L3H, true>( + passed &= test<1, T, 1, 1, 16, 32, 1, false, false, L1H, L2H, true>( 24, 64, 64, 6, 21); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 32, 2, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 32, 2, 5); passed &= - test<3, T, 1, 1, 8, 4, 2, false, false, L1H, L3H, true>(16, 7, 32, 4, 1); + test<3, T, 1, 1, 8, 4, 2, false, false, L1H, L2H, true>(16, 7, 32, 4, 1); // transformed passed &= - test<4, T, 1, 1, 8, 4, 4, false, true, L1H, L3H, true>(16, 10, 32, 6, 5); - passed &= test<5, T, 1, 1, 6, 10, 1, false, true, L1H, L3H, true>(18, 10, 32, + test<4, T, 1, 1, 8, 4, 4, false, true, L1H, L2H, true>(16, 10, 32, 6, 5); + passed &= test<5, T, 1, 1, 6, 10, 1, false, true, L1H, L2H, true>(18, 10, 32, 12, 0); passed &= - test<6, T, 1, 1, 16, 2, 2, false, true, L1H, L3H, true>(32, 4, 32, 4, 1); + test<6, T, 1, 1, 16, 2, 2, false, true, L1H, L2H, true>(32, 4, 32, 4, 1); passed &= - test<7, T, 2, 2, 2, 16, 2, false, true, L1H, L3H, true>(4, 20, 32, 0, 3); - passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L3H, true>(24, 50, 32, + test<7, T, 2, 2, 2, 16, 2, false, true, L1H, L2H, true>(4, 20, 32, 0, 3); + passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L2H, true>(24, 50, 32, 4, 14); std::cout << (passed ? "Passed\n" : "FAILED\n"); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp index 1c1b4717c8944..6dfcf035c652f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp @@ -15,29 +15,29 @@ constexpr uint32_t seed = 322; using T = uint32_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 4, 1, false, false, L1H, L3H, true>(16, 16, 32, + passed &= test<1, T, 1, 1, 16, 4, 1, false, false, L1H, L2H, true>(16, 16, 32, 2, 1); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 16, 1, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 16, 1, 5); passed &= - test<3, T, 1, 1, 8, 2, 2, false, false, L1H, L3H, true>(16, 4, 16, 3, 1); + test<3, T, 1, 1, 8, 2, 2, false, false, L1H, L2H, true>(16, 4, 16, 3, 1); // transposed passed &= - test<4, T, 1, 1, 1, 16, 1, true, false, L1H, L3H, true>(16, 20, 16, 1, 2); + test<4, T, 1, 1, 1, 16, 1, true, false, L1H, L2H, true>(16, 20, 16, 1, 2); passed &= - test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 10, 1); + test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 10, 1); passed &= - test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 11, 1); + test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 11, 1); passed &= - test<7, T, 2, 2, 8, 2, 1, true, false, L1H, L3H, true>(16, 4, 16, 1, 1); + test<7, T, 2, 2, 8, 2, 1, true, false, L1H, L2H, true>(16, 4, 16, 1, 1); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp index e691e1bb61e78..59bc2bd51497b 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp @@ -15,7 +15,7 @@ constexpr uint32_t seed = 322; using T = uint64_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); @@ -23,21 +23,21 @@ int main(void) { // non transposed, non transformed passed &= - test<1, T, 1, 1, 8, 32, 1, false, false, L1H, L3H, true>(8, 32, 8, 0, 0); + test<1, T, 1, 1, 8, 32, 1, false, false, L1H, L2H, true>(8, 32, 8, 0, 0); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 16, 1, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 16, 1, 5); passed &= - test<3, T, 1, 1, 4, 2, 1, false, false, L1H, L3H, true>(16, 4, 16, 3, 1); + test<3, T, 1, 1, 4, 2, 1, false, false, L1H, L2H, true>(16, 4, 16, 3, 1); // transposed passed &= - test<4, T, 1, 1, 1, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 1, 2); + test<4, T, 1, 1, 1, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 1, 2); passed &= - test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 10, 1); + test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 10, 1); passed &= - test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 11, 1); + test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 11, 1); passed &= - test<7, T, 2, 2, 4, 8, 1, true, false, L1H, L3H, true>(16, 9, 16, 1, 1); + test<7, T, 2, 2, 4, 8, 1, true, false, L1H, L2H, true>(16, 9, 16, 1, 1); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp index 27e0361a9d0d2..cf703ffa84db7 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp @@ -15,30 +15,30 @@ constexpr uint32_t seed = 322; using T = uint8_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 32, 2, false, false, L1H, L3H, true>( + passed &= test<1, T, 1, 1, 16, 32, 2, false, false, L1H, L2H, true>( 40, 64, 64, 4, 21); passed &= - test<2, T, 2, 2, 8, 8, 2, false, false, L1H, L3H, true>(16, 16, 64, 8, 5); - passed &= test<3, T, 1, 1, 8, 32, 2, false, false, L1H, L3H, true>(16, 80, 64, + test<2, T, 2, 2, 8, 8, 2, false, false, L1H, L2H, true>(16, 16, 64, 8, 5); + passed &= test<3, T, 1, 1, 8, 32, 2, false, false, L1H, L2H, true>(16, 80, 64, 4, 1); // transformed - passed &= test<4, T, 1, 1, 16, 4, 4, false, true, L1H, L3H, true>(100, 10, + passed &= test<4, T, 1, 1, 16, 4, 4, false, true, L1H, L2H, true>(100, 10, 128, 16, 5); - passed &= test<5, T, 1, 1, 12, 20, 1, false, true, L1H, L3H, true>(16, 40, 64, + passed &= test<5, T, 1, 1, 12, 20, 1, false, true, L1H, L2H, true>(16, 40, 64, 0, 0); passed &= - test<6, T, 1, 1, 16, 4, 2, false, true, L1H, L3H, true>(32, 4, 64, 4, 1); + test<6, T, 1, 1, 16, 4, 2, false, true, L1H, L2H, true>(32, 4, 64, 4, 1); passed &= - test<7, T, 2, 2, 4, 16, 2, false, true, L1H, L3H, true>(4, 20, 64, 0, 3); - passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L3H, true>(24, 80, 64, + test<7, T, 2, 2, 4, 16, 2, false, true, L1H, L2H, true>(4, 20, 64, 0, 3); + passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L2H, true>(24, 80, 64, 4, 14); std::cout << (passed ? "Passed\n" : "FAILED\n"); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp index e011932e8f596..15b71f20bc52f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp @@ -15,19 +15,19 @@ constexpr uint32_t seed = 197; constexpr lsc_data_size DS = lsc_data_size::u16u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non-transpose - passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L3H, true>(rand()); + passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L2H, true>(rand()); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp index 4b027d696727c..152f7f657adb6 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp @@ -15,26 +15,26 @@ constexpr uint32_t seed = 199; constexpr lsc_data_size DS = lsc_data_size::u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; template bool tests() { bool passed = true; // non transpose passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); - passed &= test(1); - passed &= test(1); + test(rand()); + passed &= test(1); + passed &= test(1); // transpose - passed &= test(); - passed &= test(); - passed &= test(); + passed &= test(); + passed &= test(); + passed &= test(); return passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp index 84a14bad846d0..170bc500e885d 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp @@ -15,26 +15,26 @@ constexpr uint32_t seed = 198; constexpr lsc_data_size DS = lsc_data_size::u64; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; template bool tests() { bool passed = true; // non transpose passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); - passed &= test(1); - passed &= test(1); + test(rand()); + passed &= test(1); + passed &= test(1); // transpose - passed &= test(); - passed &= test(); - passed &= test(); + passed &= test(); + passed &= test(); + passed &= test(); return passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp index 4b7db2784163a..15721094223a1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp @@ -15,19 +15,19 @@ constexpr uint32_t seed = 196; constexpr lsc_data_size DS = lsc_data_size::u8u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non-transpose - passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L3H, true>(rand()); + passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L2H, true>(rand()); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test/esimd/check_lsc.cpp b/sycl/test/esimd/check_lsc.cpp index 4a211b7a7c2d9..e3f7ba939a7c4 100644 --- a/sycl/test/esimd/check_lsc.cpp +++ b/sycl/test/esimd/check_lsc.cpp @@ -18,23 +18,23 @@ using namespace sycl; // --- Postive tests. template ()> SYCL_EXTERNAL auto test_load(T *ptr, int width, int height, int pitch) SYCL_ESIMD_FUNCTION { return lsc_load_2d( + TRANSFORM, L1H, L2H>( ptr, width * sizeof(T) - 1, height - 1, pitch * sizeof(T) - 1, 0, 0); } template ()> SYCL_EXTERNAL void test_store(T *ptr, simd v, int width, int height, int pitch) SYCL_ESIMD_FUNCTION { - lsc_store_2d( + lsc_store_2d( ptr, width * sizeof(T) - 1, height - 1, pitch * sizeof(T) - 1, 0, 0, v); }