From 1c9d479057016cf0d13cfc9fc8d6617c1b0e829a Mon Sep 17 00:00:00 2001 From: "Klochkov, Vyacheslav N" Date: Wed, 28 Feb 2024 20:28:43 -0800 Subject: [PATCH] [ESIMD][NFC] Rename L3 cache hint parameters to L2 to avoid confuses The experimental memory API used L3 to name the cache that always was L2 due to some historical reasons. The new non-experimental API names the same cache level as L2. This patch just renames the old L3 params to L2 to unify the naming in experimental and non-experimental namespaces and avoid the mess in cache names. Signed-off-by: Klochkov, Vyacheslav N --- .../esimd/detail/memory_intrin.hpp | 20 +- .../ext/intel/experimental/esimd/memory.hpp | 360 +++++++++--------- .../ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp | 6 +- .../ESIMD/lsc/Inputs/lsc_store_2d.hpp | 4 +- .../ESIMD/lsc/Inputs/lsc_surf_load.hpp | 16 +- .../ESIMD/lsc/Inputs/lsc_surf_store.hpp | 8 +- .../Inputs/lsc_usm_block_load_prefetch.hpp | 78 ++-- .../lsc/Inputs/lsc_usm_gather_prefetch.hpp | 30 +- .../ESIMD/lsc/Inputs/lsc_usm_store.hpp | 8 +- .../ESIMD/lsc/lsc_prefetch_2d_u16.cpp | 18 +- .../ESIMD/lsc/lsc_prefetch_2d_u32.cpp | 16 +- .../ESIMD/lsc/lsc_prefetch_2d_u64.cpp | 16 +- .../test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp | 18 +- .../ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp | 14 +- .../ESIMD/lsc/lsc_surf_prefetch_u32.cpp | 20 +- .../ESIMD/lsc/lsc_surf_prefetch_u64.cpp | 20 +- .../ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp | 14 +- sycl/test/esimd/check_lsc.cpp | 8 +- 18 files changed, 337 insertions(+), 337 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 2a8606d61cde6..4863486841955 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -69,7 +69,7 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AddressScale is the address scale. /// @tparam ImmOffset is the immediate offset added to each address. /// @tparam DS is the data size. @@ -80,7 +80,7 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal( /// @param pred is predicates. /// @param offsets is the zero-based offsets in bytes. /// @param surf_ind is the surface index. -template pred, /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AddressScale is the address scale. /// @tparam ImmOffset is the immediate offset added to each address. /// @tparam DS is the data size. @@ -106,7 +106,7 @@ __esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t pred, /// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. /// @param addrs is the prefetch addresses. -template @@ -122,7 +122,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch_stateless( /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam Transposed is the transposed version or not. /// @tparam NBlocks is the number of blocks. @@ -144,7 +144,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch_stateless( /// otherwise, /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks -template @@ -160,7 +160,7 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam NBlocks is the number of blocks. /// @tparam Transposed is the transposed version or not. @@ -177,7 +177,7 @@ __esimd_lsc_load2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, /// number of elements. /// @param Y is zero based Y-coordinate of the left upper rectangle corner in /// rows. -template @@ -192,7 +192,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( /// /// @tparam Ty is element type. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam DS is the data size. /// @tparam Transposed is the transposed version or not. /// @tparam NBlocks is the number of blocks. @@ -214,7 +214,7 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch2d_stateless( /// otherwise, /// N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) * /// getNextPowerOf2(BlockWidth) * NBlocks -template diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 26939894f2efe..2ab4c798d1e06 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -481,55 +481,55 @@ template constexpr uint32_t get_lsc_data_size() { } } -template +template constexpr uint32_t get_lsc_load_cache_mask() { if constexpr (L1H == cache_hint::read_invalidate && - L3H == cache_hint::cached) { + L2H == cache_hint::cached) { return 7; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::cached) { return 6; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::uncached) { return 5; } - if constexpr (L1H == cache_hint::cached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::cached && L2H == cache_hint::cached) { return 4; } - if constexpr (L1H == cache_hint::cached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::cached && L2H == cache_hint::uncached) { return 3; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::cached) { return 2; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::uncached) { return 1; } return 0; } -template +template constexpr uint32_t get_lsc_store_cache_mask() { - if constexpr (L1H == cache_hint::write_back && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::write_back && L2H == cache_hint::cached) { return 7; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::cached) { return 6; } - if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::streaming && L2H == cache_hint::uncached) { return 5; } - if constexpr (L1H == cache_hint::write_through && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::write_through && L2H == cache_hint::cached) { return 4; } if constexpr (L1H == cache_hint::write_through && - L3H == cache_hint::uncached) { + L2H == cache_hint::uncached) { return 3; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::cached) { return 2; } - if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) { + if constexpr (L1H == cache_hint::uncached && L2H == cache_hint::uncached) { return 1; } return 0; @@ -647,7 +647,7 @@ lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -656,12 +656,12 @@ lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - return __ESIMD_DNS::gather_impl(p, offsets, pred); + return __ESIMD_DNS::gather_impl(p, offsets, pred); } /// USM pointer gather. @@ -675,7 +675,7 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -686,58 +686,58 @@ lsc_gather(const T *p, __ESIMD_NS::simd offsets, /// template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return __ESIMD_DNS::gather_impl(p, offsets, pred, + return __ESIMD_DNS::gather_impl(p, offsets, pred, pass_thru); } template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather(p, offsets.read(), pred); + return lsc_gather(p, offsets.read(), pred); } template __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather(p, offsets.read(), pred, + return lsc_gather(p, offsets.read(), pred, pass_thru); } template __ESIMD_API std::enable_if_t, __ESIMD_NS::simd> lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather( + return lsc_gather( p, __ESIMD_NS::simd(offset), pred); } template __ESIMD_API std::enable_if_t, __ESIMD_NS::simd> lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather( + return lsc_gather( p, __ESIMD_NS::simd(offset), pred, pass_thru); } @@ -752,7 +752,7 @@ lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -762,7 +762,7 @@ lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred, /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< @@ -772,11 +772,11 @@ __ESIMD_API __ESIMD_NS::simd<__ESIMD_DNS::DeviceAccessorOffsetT, N> offsets, __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_gather( + return lsc_gather( reinterpret_cast(acc.get_pointer().get()), offsets, pred); #else __ESIMD_NS::simd PassThru; // Intentionally unitialized. - return __ESIMD_DNS::gather_impl( + return __ESIMD_DNS::gather_impl( acc, offsets, pred, PassThru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -784,7 +784,7 @@ __ESIMD_API #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -793,14 +793,14 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - return lsc_gather( + return lsc_gather( acc, convert(offsets), pred); } #endif template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_local_accessor_with_v< @@ -823,7 +823,7 @@ __ESIMD_API /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -835,7 +835,7 @@ __ESIMD_API /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< @@ -846,11 +846,11 @@ __ESIMD_API __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_gather( + return lsc_gather( reinterpret_cast(acc.get_pointer().get()), offsets, pred, pass_thru); #else - return __ESIMD_DNS::gather_impl( + return __ESIMD_DNS::gather_impl( acc, offsets, pred, pass_thru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -858,7 +858,7 @@ __ESIMD_API #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -868,14 +868,14 @@ __ESIMD_API std::enable_if_t< lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return lsc_gather( + return lsc_gather( acc, convert(offsets), pred, pass_thru); } #endif template __ESIMD_API std::enable_if_t< sycl::detail::acc_properties::is_local_accessor_v, @@ -912,7 +912,7 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - @@ -923,13 +923,13 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// are undefined. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, flags); + return __ESIMD_DNS::block_load_impl(p, pred, flags); } /// A variation of lsc_block_load without predicate parameter to simplify use @@ -953,7 +953,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the @@ -961,12 +961,12 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, /// are undefined. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, FlagsT flags) { - return __ESIMD_DNS::block_load_impl( + return __ESIMD_DNS::block_load_impl( p, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -992,7 +992,7 @@ lsc_block_load(const T *p, FlagsT flags) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. @@ -1002,13 +1002,13 @@ lsc_block_load(const T *p, FlagsT flags) { /// @return is a vector of type T and size NElts. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, pass_thru, + return __ESIMD_DNS::block_load_impl(p, pred, pass_thru, flags); } @@ -1032,7 +1032,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1044,7 +1044,7 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, /// vector for which the corresponding element in \p pred is 0 are undefined. /// template __ESIMD_API std::enable_if_t< @@ -1054,12 +1054,12 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, + return __ESIMD_DNS::block_load_impl(acc, offset, pred, flags); } template __ESIMD_API std::enable_if_t< @@ -1092,7 +1092,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1101,7 +1101,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, /// vector for which the corresponding element in \p pred is 0 are undefined. /// template __ESIMD_API std::enable_if_t< @@ -1111,12 +1111,12 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, FlagsT flags) { - return lsc_block_load( + return lsc_block_load( acc, offset, __ESIMD_NS::simd_mask<1>(1), flags); } template __ESIMD_API std::enable_if_t< @@ -1125,7 +1125,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { - return lsc_block_load( + return lsc_block_load( acc, offset, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1149,7 +1149,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1162,7 +1162,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, FlagsT flags) { /// @return is a vector of type T and size NElts /// template __ESIMD_API std::enable_if_t< @@ -1173,12 +1173,12 @@ __ESIMD_API std::enable_if_t< lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred, __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, + return __ESIMD_DNS::block_load_impl(acc, offset, pred, pass_thru, flags); } template __ESIMD_API std::enable_if_t< @@ -1202,7 +1202,7 @@ lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -1210,14 +1210,14 @@ lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred, /// template __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { static_assert(std::is_integral_v, "Unsupported offset type"); detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -1228,28 +1228,28 @@ __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, using MsgT = typename detail::lsc_expand_type::type; __ESIMD_NS::simd addrs = reinterpret_cast(p); addrs += convert(offsets); - __esimd_lsc_prefetch_stateless(pred.data(), addrs.data()); } template __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch(p, offsets.read(), pred); + lsc_prefetch(p, offsets.read(), pred); } template __ESIMD_API std::enable_if_t> lsc_prefetch(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch( + lsc_prefetch( p, __ESIMD_NS::simd(offset), pred); } @@ -1263,16 +1263,16 @@ lsc_prefetch(const T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// template + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> __ESIMD_API void lsc_prefetch(const T *p) { detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = detail::finalize_data_size(); @@ -1287,7 +1287,7 @@ __ESIMD_API void lsc_prefetch(const T *p) { __ESIMD_NS::simd_mask pred = 1; __ESIMD_NS::simd addrs = reinterpret_cast(p); - __esimd_lsc_prefetch_stateless(pred.data(), addrs.data()); } @@ -1302,7 +1302,7 @@ __ESIMD_API void lsc_prefetch(const T *p) { /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -1311,7 +1311,7 @@ __ESIMD_API void lsc_prefetch(const T *p) { /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_read>> @@ -1323,12 +1323,12 @@ lsc_prefetch(AccessorTy acc, #endif __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_prefetch( + return lsc_prefetch( __ESIMD_DNS::accessorToPointer(acc), offsets, pred); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -1338,7 +1338,7 @@ lsc_prefetch(AccessorTy acc, detail::lsc_data_order::nontranspose; using MsgT = typename detail::lsc_expand_type::type; auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_prefetch_bti(pred.data(), offsets.data(), si); #endif } @@ -1346,7 +1346,7 @@ lsc_prefetch(AccessorTy acc, #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -1354,7 +1354,7 @@ __ESIMD_API std::enable_if_t< std::is_integral_v && !std::is_same_v> lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - lsc_prefetch( + lsc_prefetch( acc, convert(offsets), pred); } #endif @@ -1369,25 +1369,25 @@ lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_read>> lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) { #ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_prefetch( + lsc_prefetch( __ESIMD_DNS::accessorToPointer(acc, offset)); #else detail::check_lsc_vector_size(); detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = detail::finalize_data_size(); @@ -1401,7 +1401,7 @@ lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) { __ESIMD_NS::simd_mask pred = 1; __ESIMD_NS::simd offsets = offset; auto si = __ESIMD_NS::get_surface_index(acc); - __esimd_lsc_prefetch_bti(pred.data(), offsets.data(), si); #endif } @@ -1460,7 +1460,7 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -1469,34 +1469,34 @@ __ESIMD_API void lsc_slm_block_store(uint32_t offset, /// template __ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - __ESIMD_DNS::scatter_impl(p, offsets, + __ESIMD_DNS::scatter_impl(p, offsets, vals, pred); } template __ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter(p, offsets.read(), vals, pred); + lsc_scatter(p, offsets.read(), vals, pred); } template __ESIMD_API std::enable_if_t && N == 1> lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter( + lsc_scatter( p, __ESIMD_NS::simd(offset), vals, pred); } @@ -1510,7 +1510,7 @@ lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -1520,7 +1520,7 @@ lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd vals, /// template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_write>> @@ -1529,17 +1529,17 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { #ifdef __ESIMD_FORCE_STATELESS_MEM - lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), + lsc_scatter(__ESIMD_DNS::accessorToPointer(acc), offsets, vals, pred); #else - __ESIMD_DNS::scatter_impl(acc, offsets, vals, pred); + __ESIMD_DNS::scatter_impl(acc, offsets, vals, pred); #endif } #ifdef __ESIMD_FORCE_STATELESS_MEM template __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -1548,14 +1548,14 @@ __ESIMD_API std::enable_if_t< lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - lsc_scatter( + lsc_scatter( acc, convert(offsets), vals, pred); } #endif template __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_local_accessor_with_v< AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_write>> @@ -1590,7 +1590,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param vals is values to store. /// @param pred is operation predicate. Zero means operation is skipped @@ -1599,12 +1599,12 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v> lsc_block_store(T *p, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_store_impl(p, vals, pred, + return __ESIMD_DNS::block_store_impl(p, vals, pred, flags); } @@ -1631,17 +1631,17 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param vals is values to store. /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v> lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { - lsc_block_store(p, vals, __ESIMD_NS::simd_mask<1>(1), + lsc_block_store(p, vals, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1669,7 +1669,7 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1680,7 +1680,7 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t< @@ -1690,12 +1690,12 @@ __ESIMD_API std::enable_if_t< lsc_block_store(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - __ESIMD_DNS::block_store_impl(acc, offset, vals, pred, + __ESIMD_DNS::block_store_impl(acc, offset, vals, pred, flags); } template __ESIMD_API std::enable_if_t< @@ -1731,7 +1731,7 @@ lsc_block_store(AccessorTy acc, uint32_t offset, /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1739,7 +1739,7 @@ lsc_block_store(AccessorTy acc, uint32_t offset, /// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t< @@ -1748,7 +1748,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::is_simd_flag_type_v> lsc_block_store(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd vals, FlagsT flags) { - lsc_block_store(acc, offset, vals, + lsc_block_store(acc, offset, vals, __ESIMD_NS::simd_mask<1>(1), flags); } @@ -1838,7 +1838,7 @@ constexpr void check_lsc_block_2d_restrictions() { /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -1856,14 +1856,14 @@ constexpr void check_lsc_block_2d_restrictions() { /// template ()> __ESIMD_API __ESIMD_NS::simd lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -1900,7 +1900,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, Transposed ? detail::lsc_data_order::transpose : detail::lsc_data_order::nontranspose; __ESIMD_NS::simd Raw = - __esimd_lsc_load2d_stateless(pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, @@ -1954,7 +1954,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, /// @tparam BlockHeight is the block height in number of elements. /// @tparam NBlocks is the number of blocks. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -1966,13 +1966,13 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, /// rows. /// template ()> __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -1982,7 +1982,7 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, uintptr_t surf_addr = reinterpret_cast(Ptr); constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; - __esimd_lsc_prefetch2d_stateless( pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); } @@ -1997,7 +1997,7 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, /// @tparam BlockWidth is the block width in number of elements. /// @tparam BlockHeight is the block height in number of elements. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param Ptr is the surface base address for this operation. /// @param SurfaceWidth is the surface width minus 1 in bytes @@ -2012,14 +2012,14 @@ __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth, /// getNextPowerOf2(BlockWidth) * NBlocks /// template ()> __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, __ESIMD_NS::simd Vals) { using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -2043,7 +2043,7 @@ __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, } __ESIMD_NS::simd_mask pred = 1; - __esimd_lsc_store2d_stateless( pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Raw.data()); @@ -2236,19 +2236,19 @@ class config_2d_mem_access { __ESIMD_NS::simd payload_data; template friend ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( config_2d_mem_access &payload); template + cache_hint L1H, cache_hint L2H, int N> friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_store_2d( config_2d_mem_access &payload, __ESIMD_NS::simd Data); template friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( config_2d_mem_access &payload); @@ -2266,7 +2266,7 @@ class config_2d_mem_access { /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// @return is a vector of type T and size N, where N is @@ -2275,7 +2275,7 @@ class config_2d_mem_access { /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( @@ -2283,7 +2283,7 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr int ElemsPerDword = 4 / sizeof(T); constexpr int GRFRowSize = Transposed ? BlockHeight : Transformed ? BlockWidth * ElemsPerDword @@ -2311,7 +2311,7 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( static_assert(N == ActualN || N == DstElements, "Incorrect element count"); - constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; constexpr uint32_t base_desc = 0x2000003; constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0; @@ -2363,24 +2363,24 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( /// @tparam Transposed is the transposed version or not. /// @tparam Transformed is apply VNNI transform or not. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( config_2d_mem_access &payload) { - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); static_assert(!Transposed || !Transformed, "Transposed and transformed is not supported"); - constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask() << 17; constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; constexpr uint32_t base_desc = 0x2000003; @@ -2406,13 +2406,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( /// @tparam BlockHeight block height in number of elements /// @tparam NBlocks Number of blocks /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam N is the data size /// @param payload is \c config_2d_mem_access \c object holding all the data /// @param Data is the data to be stored. /// template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION void @@ -2421,9 +2421,9 @@ lsc_store_2d(config_2d_mem_access &payload, detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); - constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() + constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() << 17; constexpr uint32_t dataSizeMask = detail::get_lsc_data_size() << 9; constexpr uint32_t base_desc = 0x2000007; @@ -2532,32 +2532,32 @@ lsc_slm_atomic_update(__ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param pred is predicates. /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 0, __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), pred); } @@ -2570,7 +2570,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. @@ -2578,32 +2578,32 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename OffsetObjT, typename RegionTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update(p, offsets.read(), src0, + return lsc_atomic_update(p, offsets.read(), src0, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 1 && @@ -2613,7 +2613,7 @@ __ESIMD_API std::enable_if_t && __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), src0, pred); } @@ -2626,7 +2626,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand (expected value). @@ -2635,33 +2635,33 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, src1, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename OffsetObjT, typename RegionTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update(p, offsets.read(), src0, + return lsc_atomic_update(p, offsets.read(), src0, src1, pred); } template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename Toffset> __ESIMD_API std::enable_if_t && __ESIMD_DNS::get_num_args() == 2, @@ -2669,7 +2669,7 @@ __ESIMD_API std::enable_if_t && lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { - return lsc_atomic_update( + return lsc_atomic_update( p, __ESIMD_NS::simd(offset), src0, src1, pred); } @@ -2682,7 +2682,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2692,7 +2692,7 @@ lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t< __ESIMD_DNS::is_device_accessor_with_v< @@ -2703,7 +2703,7 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, + return __ESIMD_DNS::atomic_update_impl(acc, offsets, pred); } @@ -2714,7 +2714,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2724,7 +2724,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> @@ -2743,7 +2743,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2754,13 +2754,13 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, + return __ESIMD_DNS::atomic_update_impl(acc, offsets, src0, pred); } @@ -2771,7 +2771,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2782,7 +2782,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> @@ -2801,7 +2801,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2813,7 +2813,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, __ESIMD_NS::simd> @@ -2821,7 +2821,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return lsc_atomic_update( + return lsc_atomic_update( __ESIMD_DNS::accessorToPointer(acc), offsets, src0, src1, pred); #else static_assert(std::is_integral_v && sizeof(Toffset) == 4, @@ -2829,7 +2829,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, detail::check_lsc_vector_size<1>(); detail::check_lsc_data_size(); __ESIMD_DNS::check_atomic(); - detail::check_lsc_cache_hint(); + detail::check_lsc_cache_hint(); constexpr uint16_t _AddressScale = 1; constexpr int _ImmOffset = 0; constexpr lsc_data_size _DS = @@ -2843,7 +2843,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd Msg_data1 = detail::lsc_format_input(src1); auto si = __ESIMD_NS::get_surface_index(acc); __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_bti_2( pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data(), si); return detail::lsc_format_ret(Tmp); @@ -2857,7 +2857,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. /// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. +/// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offsets is the zero-based offsets. @@ -2869,7 +2869,7 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, /// update. template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, typename AccessorTy> __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v, __ESIMD_NS::simd> diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp index 5857bce003f89..42c437700c0c1 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_load_prefetch_2d.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { @@ -90,14 +90,14 @@ bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, simd vals; if constexpr (use_prefetch) { - lsc_prefetch_2d( + lsc_prefetch_2d( in + off, width, height, pitch, X, Y); vals = lsc_load_2d(in + off, width, height, pitch, X, Y); } else { vals = lsc_load_2d( + Transposed, Transformed, L1H, L2H>( in + off, width, height, pitch, X, Y); } diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp index 3264a16ffc2df..ee553fd12c33b 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_store_2d.hpp @@ -53,7 +53,7 @@ template (), - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none> + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { @@ -89,7 +89,7 @@ bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, simd vals(new_val + off, 1); // IUT - lsc_store_2d( + lsc_store_2d( out + off, SurfaceWidth * sizeof(T) - 1, SurfaceHeight - 1, SurfacePitch * sizeof(T) - 1, X, Y, vals); }); diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp index 97ad1a68dfadd..54359e458793a 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_load.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template < int case_num, typename T, uint32_t Groups, uint32_t Threads, uint16_t VL, uint16_t VS, bool transpose, lsc_data_size DS = lsc_data_size::default_size, - cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, + cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool use_prefetch = false, typename Flags = __ESIMD_NS::overaligned_tag<4>> bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -87,19 +87,19 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals; if constexpr (use_prefetch) { - lsc_prefetch(acci, byte_off); + lsc_prefetch(acci, byte_off); if constexpr (sizeof(T) < 8) { - vals = lsc_block_load(acci, byte_off, + vals = lsc_block_load(acci, byte_off, Flags{}); } else { - vals = lsc_block_load(acci, byte_off); + vals = lsc_block_load(acci, byte_off); } } else { if constexpr (sizeof(T) < 8) { - vals = lsc_block_load(acci, byte_off, + vals = lsc_block_load(acci, byte_off, Flags{}); } else { - vals = lsc_block_load(acci, byte_off); + vals = lsc_block_load(acci, byte_off); } } if constexpr (sizeof(T) < 8) { @@ -115,11 +115,11 @@ bool test(uint32_t pmask = 0xffffffff) { simd vals; if constexpr (use_prefetch) { - lsc_prefetch(acci, offset, pred); + lsc_prefetch(acci, offset, pred); vals = lsc_gather(acci, offset, pred); } else { - vals = lsc_gather(acci, offset, pred); + vals = lsc_gather(acci, offset, pred); } if constexpr (DS == lsc_data_size::u8u32 || diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp index 0331db4ba8a73..ea118012f07f5 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf_store.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template > bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -82,10 +82,10 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals(new_val + elem_off, 1); if constexpr (sizeof(T) < 8) { - lsc_block_store(acco, byte_off, vals, + lsc_block_store(acco, byte_off, vals, Flags{}); } else { - lsc_block_store(acco, byte_off, vals); + lsc_block_store(acco, byte_off, vals); } } else { simd offset(byte_off, VS * sizeof(T)); @@ -99,7 +99,7 @@ bool test(uint32_t pmask = 0xffffffff) { for (int j = 0; j < VS; j++) vals.template select<1, 1>(i + j * VL) = val++; - lsc_scatter(acco, offset, vals, pred); + lsc_scatter(acco, offset, vals, pred); } }); }); diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp index bab1dc2991894..f82e0d0dc52c5 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp @@ -20,7 +20,7 @@ using namespace sycl::ext::intel::experimental::esimd; template > bool test(queue Q, uint32_t Groups, uint32_t Threads) { @@ -63,7 +63,7 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { simd_mask<1> Mask = GlobalID % 1; if constexpr (UsePrefetch) { - lsc_prefetch(In + ElemOffset); + lsc_prefetch(In + ElemOffset); if constexpr (sizeof(T) < 8) { Vals = lsc_block_load(In + ElemOffset, Mask, OldValues, Flags{}); @@ -72,16 +72,16 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { } } else { if constexpr (sizeof(T) < 8) { - Vals = lsc_block_load(In + ElemOffset, Mask, + Vals = lsc_block_load(In + ElemOffset, Mask, OldValues, Flags{}); } else { - Vals = lsc_block_load(In + ElemOffset, Mask, + Vals = lsc_block_load(In + ElemOffset, Mask, OldValues); } } } else { if constexpr (UsePrefetch) { - lsc_prefetch(In + ElemOffset); + lsc_prefetch(In + ElemOffset); if constexpr (sizeof(T) < 8) { Vals = lsc_block_load(In + ElemOffset, Flags{}); } else { @@ -90,9 +90,9 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { } else { if constexpr (sizeof(T) < 8) { Vals = - lsc_block_load(In + ElemOffset, Flags{}); + lsc_block_load(In + ElemOffset, Flags{}); } else { - Vals = lsc_block_load(In + ElemOffset); + Vals = lsc_block_load(In + ElemOffset); } } } @@ -138,7 +138,7 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) { template bool test_lsc_block_load() { constexpr lsc_data_size DS = lsc_data_size::default_size; constexpr cache_hint L1H = cache_hint::none; - constexpr cache_hint L3H = cache_hint::none; + constexpr cache_hint L2H = cache_hint::none; constexpr bool NoPrefetch = false; constexpr bool CheckMerge = true; @@ -150,52 +150,52 @@ template bool test_lsc_block_load() { << Q.get_device().get_info() << std::endl; bool Passed = true; - Passed &= test(Q, 1, 4); - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); if constexpr (sizeof(T) <= 4) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) == 2) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); } if constexpr (sizeof(T) == 1) { - Passed &= test>(Q, 1, 4); - Passed &= test>(Q, 1, 4); } } - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 4, 4); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 4, 4); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); // Only 512-bits maximum can be loaded at once (i.e. 4*128 bytes). if constexpr (sizeof(T) <= 4) - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) <= 2) - Passed &= test>(Q, 1, 4); if constexpr (sizeof(T) == 1) - Passed &= test>(Q, 1, 4); return Passed; @@ -205,7 +205,7 @@ template std::enable_if_t test_lsc_prefetch() { constexpr cache_hint L1H = cache_hint::cached; - constexpr cache_hint L3H = cache_hint::uncached; + constexpr cache_hint L2H = cache_hint::uncached; constexpr bool DoPrefetch = true; auto Q = queue{gpu_selector_v}; @@ -214,15 +214,15 @@ std::enable_if_t test_lsc_prefetch() { << Q.get_device().get_info() << std::endl; bool Passed = true; - Passed &= test(Q, 1, 4); - Passed &= test(Q, 1, 4); - Passed &= test(Q, 2, 2); - Passed &= test(Q, 2, 8); - Passed &= test(Q, 3, 3); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 1, 4); + Passed &= test(Q, 2, 2); + Passed &= test(Q, 2, 8); + Passed &= test(Q, 3, 3); if constexpr (sizeof(T) * 2 >= sizeof(int)) - Passed &= test(Q, 5, 5); + Passed &= test(Q, 5, 5); if constexpr (sizeof(T) >= sizeof(int)) - Passed &= test(Q, 3, 5); + Passed &= test(Q, 3, 5); return Passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp index 21c1991cc7267..117b03d919f96 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_gather_prefetch.hpp @@ -26,7 +26,7 @@ typedef uint32_t Toffset; template bool test(queue q, uint32_t Groups, uint32_t Threads, uint32_t pmask = 0xffffffff) { @@ -90,15 +90,15 @@ bool test(queue q, uint32_t Groups, uint32_t Threads, simd vals; if constexpr (use_prefetch) { - lsc_prefetch(in, offset, pred); + lsc_prefetch(in, offset, pred); vals = lsc_gather( in, offset, pred); } else if constexpr (!use_old_values) { - vals = lsc_gather(in, offset, pred); + vals = lsc_gather(in, offset, pred); } else { // use_old_values simd old_values = merge_value; vals = - lsc_gather(in, offset, pred, old_values); + lsc_gather(in, offset, pred, old_values); } if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) @@ -158,38 +158,38 @@ template bool test_lsc_gather_prefetch(queue q) { constexpr cache_hint L1H = cache_hint::cached; - constexpr cache_hint L3H = cache_hint::uncached; + constexpr cache_hint L2H = cache_hint::uncached; constexpr bool DoMerging = true; bool Passed = true; Passed &= - test(q, 4, 4, rand()); + test(q, 4, 4, rand()); if constexpr (!DoPrefetch) Passed &= - test(q, 4, 4, rand()); + test(q, 4, 4, rand()); #ifndef USE_SCALAR_OFFSET // These tests use lsc_scatter with scalar offset when USE_SCALAR_OFFSET macro // is set, which is UB and thus guarded by the macro here. - Passed &= test(q, 1, 4, rand()); - Passed &= test(q, 2, 4, rand()); - Passed &= test(q, 2, 2, rand()); - Passed &= test(q, 4, 2, rand()); - Passed &= test(q, 4, 16, rand()); + Passed &= test(q, 1, 4, rand()); + Passed &= test(q, 2, 4, rand()); + Passed &= test(q, 2, 2, rand()); + Passed &= test(q, 4, 2, rand()); + Passed &= test(q, 4, 16, rand()); // The next block of tests is only for gather with merging semantics, // not for prefetch tests. if constexpr (!DoPrefetch) { Passed &= - test(q, 1, 4, rand()); + test(q, 1, 4, rand()); Passed &= - test(q, 4, 16, rand()); + test(q, 4, 16, rand()); } if constexpr (((DS == lsc_data_size::default_size && sizeof(T) >= 4) || DS == lsc_data_size::u32 || DS == lsc_data_size::u32) && !DoPrefetch) { - Passed &= test(q, 2, 4, rand()); + Passed &= test(q, 2, 4, rand()); } #endif // !USE_SCALAR_OFFSET diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp index 4765152b644c8..e7864d6401835 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_store.hpp @@ -26,7 +26,7 @@ typedef uint32_t Toffset; template > bool test(uint32_t pmask = 0xffffffff) { static_assert((VL == 1) || !transpose, "Transpose must have exec size 1"); @@ -86,10 +86,10 @@ bool test(uint32_t pmask = 0xffffffff) { if constexpr (transpose) { simd vals(new_val + elem_off, 1); if constexpr (sizeof(T) < 8) { - lsc_block_store(out + elem_off, vals, + lsc_block_store(out + elem_off, vals, Flags{}); } else { - lsc_block_store(out + elem_off, vals); + lsc_block_store(out + elem_off, vals); } } else { simd offset(byte_off, VS * sizeof(T)); @@ -103,7 +103,7 @@ bool test(uint32_t pmask = 0xffffffff) { for (int j = 0; j < VS; j++) vals.template select<1, 1>(i + j * VL) = val++; - lsc_scatter(out, offset, vals, pred); + lsc_scatter(out, offset, vals, pred); } }); }); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp index 65d5afe54c541..92b60cfe4a4bd 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u16.cpp @@ -15,30 +15,30 @@ constexpr uint32_t seed = 322; using T = uint16_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 32, 1, false, false, L1H, L3H, true>( + passed &= test<1, T, 1, 1, 16, 32, 1, false, false, L1H, L2H, true>( 24, 64, 64, 6, 21); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 32, 2, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 32, 2, 5); passed &= - test<3, T, 1, 1, 8, 4, 2, false, false, L1H, L3H, true>(16, 7, 32, 4, 1); + test<3, T, 1, 1, 8, 4, 2, false, false, L1H, L2H, true>(16, 7, 32, 4, 1); // transformed passed &= - test<4, T, 1, 1, 8, 4, 4, false, true, L1H, L3H, true>(16, 10, 32, 6, 5); - passed &= test<5, T, 1, 1, 6, 10, 1, false, true, L1H, L3H, true>(18, 10, 32, + test<4, T, 1, 1, 8, 4, 4, false, true, L1H, L2H, true>(16, 10, 32, 6, 5); + passed &= test<5, T, 1, 1, 6, 10, 1, false, true, L1H, L2H, true>(18, 10, 32, 12, 0); passed &= - test<6, T, 1, 1, 16, 2, 2, false, true, L1H, L3H, true>(32, 4, 32, 4, 1); + test<6, T, 1, 1, 16, 2, 2, false, true, L1H, L2H, true>(32, 4, 32, 4, 1); passed &= - test<7, T, 2, 2, 2, 16, 2, false, true, L1H, L3H, true>(4, 20, 32, 0, 3); - passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L3H, true>(24, 50, 32, + test<7, T, 2, 2, 2, 16, 2, false, true, L1H, L2H, true>(4, 20, 32, 0, 3); + passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L2H, true>(24, 50, 32, 4, 14); std::cout << (passed ? "Passed\n" : "FAILED\n"); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp index 1c1b4717c8944..6dfcf035c652f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u32.cpp @@ -15,29 +15,29 @@ constexpr uint32_t seed = 322; using T = uint32_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 4, 1, false, false, L1H, L3H, true>(16, 16, 32, + passed &= test<1, T, 1, 1, 16, 4, 1, false, false, L1H, L2H, true>(16, 16, 32, 2, 1); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 16, 1, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 16, 1, 5); passed &= - test<3, T, 1, 1, 8, 2, 2, false, false, L1H, L3H, true>(16, 4, 16, 3, 1); + test<3, T, 1, 1, 8, 2, 2, false, false, L1H, L2H, true>(16, 4, 16, 3, 1); // transposed passed &= - test<4, T, 1, 1, 1, 16, 1, true, false, L1H, L3H, true>(16, 20, 16, 1, 2); + test<4, T, 1, 1, 1, 16, 1, true, false, L1H, L2H, true>(16, 20, 16, 1, 2); passed &= - test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 10, 1); + test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 10, 1); passed &= - test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 11, 1); + test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 11, 1); passed &= - test<7, T, 2, 2, 8, 2, 1, true, false, L1H, L3H, true>(16, 4, 16, 1, 1); + test<7, T, 2, 2, 8, 2, 1, true, false, L1H, L2H, true>(16, 4, 16, 1, 1); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp index e691e1bb61e78..59bc2bd51497b 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u64.cpp @@ -15,7 +15,7 @@ constexpr uint32_t seed = 322; using T = uint64_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); @@ -23,21 +23,21 @@ int main(void) { // non transposed, non transformed passed &= - test<1, T, 1, 1, 8, 32, 1, false, false, L1H, L3H, true>(8, 32, 8, 0, 0); + test<1, T, 1, 1, 8, 32, 1, false, false, L1H, L2H, true>(8, 32, 8, 0, 0); passed &= - test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L3H, true>(16, 16, 16, 1, 5); + test<2, T, 2, 2, 8, 4, 1, false, false, L1H, L2H, true>(16, 16, 16, 1, 5); passed &= - test<3, T, 1, 1, 4, 2, 1, false, false, L1H, L3H, true>(16, 4, 16, 3, 1); + test<3, T, 1, 1, 4, 2, 1, false, false, L1H, L2H, true>(16, 4, 16, 3, 1); // transposed passed &= - test<4, T, 1, 1, 1, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 1, 2); + test<4, T, 1, 1, 1, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 1, 2); passed &= - test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 10, 1); + test<5, T, 1, 1, 2, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 10, 1); passed &= - test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L3H, true>(16, 10, 16, 11, 1); + test<6, T, 1, 1, 4, 8, 1, true, false, L1H, L2H, true>(16, 10, 16, 11, 1); passed &= - test<7, T, 2, 2, 4, 8, 1, true, false, L1H, L3H, true>(16, 9, 16, 1, 1); + test<7, T, 2, 2, 4, 8, 1, true, false, L1H, L2H, true>(16, 9, 16, 1, 1); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp index 27e0361a9d0d2..cf703ffa84db7 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_prefetch_2d_u8.cpp @@ -15,30 +15,30 @@ constexpr uint32_t seed = 322; using T = uint8_t; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non transposed, non transformed - passed &= test<1, T, 1, 1, 16, 32, 2, false, false, L1H, L3H, true>( + passed &= test<1, T, 1, 1, 16, 32, 2, false, false, L1H, L2H, true>( 40, 64, 64, 4, 21); passed &= - test<2, T, 2, 2, 8, 8, 2, false, false, L1H, L3H, true>(16, 16, 64, 8, 5); - passed &= test<3, T, 1, 1, 8, 32, 2, false, false, L1H, L3H, true>(16, 80, 64, + test<2, T, 2, 2, 8, 8, 2, false, false, L1H, L2H, true>(16, 16, 64, 8, 5); + passed &= test<3, T, 1, 1, 8, 32, 2, false, false, L1H, L2H, true>(16, 80, 64, 4, 1); // transformed - passed &= test<4, T, 1, 1, 16, 4, 4, false, true, L1H, L3H, true>(100, 10, + passed &= test<4, T, 1, 1, 16, 4, 4, false, true, L1H, L2H, true>(100, 10, 128, 16, 5); - passed &= test<5, T, 1, 1, 12, 20, 1, false, true, L1H, L3H, true>(16, 40, 64, + passed &= test<5, T, 1, 1, 12, 20, 1, false, true, L1H, L2H, true>(16, 40, 64, 0, 0); passed &= - test<6, T, 1, 1, 16, 4, 2, false, true, L1H, L3H, true>(32, 4, 64, 4, 1); + test<6, T, 1, 1, 16, 4, 2, false, true, L1H, L2H, true>(32, 4, 64, 4, 1); passed &= - test<7, T, 2, 2, 4, 16, 2, false, true, L1H, L3H, true>(4, 20, 64, 0, 3); - passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L3H, true>(24, 80, 64, + test<7, T, 2, 2, 4, 16, 2, false, true, L1H, L2H, true>(4, 20, 64, 0, 3); + passed &= test<8, T, 1, 1, 16, 32, 1, false, true, L1H, L2H, true>(24, 80, 64, 4, 14); std::cout << (passed ? "Passed\n" : "FAILED\n"); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp index e011932e8f596..15b71f20bc52f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u16u32.cpp @@ -15,19 +15,19 @@ constexpr uint32_t seed = 197; constexpr lsc_data_size DS = lsc_data_size::u16u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non-transpose - passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L3H, true>(rand()); + passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L2H, true>(rand()); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp index 4b027d696727c..152f7f657adb6 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u32.cpp @@ -15,26 +15,26 @@ constexpr uint32_t seed = 199; constexpr lsc_data_size DS = lsc_data_size::u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; template bool tests() { bool passed = true; // non transpose passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); - passed &= test(1); - passed &= test(1); + test(rand()); + passed &= test(1); + passed &= test(1); // transpose - passed &= test(); - passed &= test(); - passed &= test(); + passed &= test(); + passed &= test(); + passed &= test(); return passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp index 84a14bad846d0..170bc500e885d 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u64.cpp @@ -15,26 +15,26 @@ constexpr uint32_t seed = 198; constexpr lsc_data_size DS = lsc_data_size::u64; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; template bool tests() { bool passed = true; // non transpose passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); + test(rand()); passed &= - test(rand()); - passed &= test(1); - passed &= test(1); + test(rand()); + passed &= test(1); + passed &= test(1); // transpose - passed &= test(); - passed &= test(); - passed &= test(); + passed &= test(); + passed &= test(); + passed &= test(); return passed; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp index 4b7db2784163a..15721094223a1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_prefetch_u8u32.cpp @@ -15,19 +15,19 @@ constexpr uint32_t seed = 196; constexpr lsc_data_size DS = lsc_data_size::u8u32; constexpr cache_hint L1H = cache_hint::cached; -constexpr cache_hint L3H = cache_hint::uncached; +constexpr cache_hint L2H = cache_hint::uncached; int main(void) { srand(seed); bool passed = true; // non-transpose - passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L3H, true>(rand()); - passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L3H, true>(rand()); + passed &= test<0, uint32_t, 1, 1, 1, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<1, uint32_t, 1, 4, 32, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<2, uint32_t, 2, 4, 16, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<3, uint32_t, 2, 2, 8, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<4, uint32_t, 4, 2, 4, 1, false, DS, L1H, L2H, true>(rand()); + passed &= test<5, uint32_t, 4, 16, 2, 1, false, DS, L1H, L2H, true>(rand()); std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; diff --git a/sycl/test/esimd/check_lsc.cpp b/sycl/test/esimd/check_lsc.cpp index 4a211b7a7c2d9..e3f7ba939a7c4 100644 --- a/sycl/test/esimd/check_lsc.cpp +++ b/sycl/test/esimd/check_lsc.cpp @@ -18,23 +18,23 @@ using namespace sycl; // --- Postive tests. template ()> SYCL_EXTERNAL auto test_load(T *ptr, int width, int height, int pitch) SYCL_ESIMD_FUNCTION { return lsc_load_2d( + TRANSFORM, L1H, L2H>( ptr, width * sizeof(T) - 1, height - 1, pitch * sizeof(T) - 1, 0, 0); } template ()> SYCL_EXTERNAL void test_store(T *ptr, simd v, int width, int height, int pitch) SYCL_ESIMD_FUNCTION { - lsc_store_2d( + lsc_store_2d( ptr, width * sizeof(T) - 1, height - 1, pitch * sizeof(T) - 1, 0, 0, v); }