From fdf7745213a6fe47fa7b1bbd169937b4e0a75dcd Mon Sep 17 00:00:00 2001 From: "Klochkov, Vyacheslav N" Date: Tue, 5 Mar 2024 14:19:00 -0800 Subject: [PATCH] [ESIMD][NFC] Rework L1/L2 cache hints passing across internal funcs(part2) gather_impl(), scatter_impl(), atomic_update_impl(), prefetch_impl() now accept a list of properties that may include L1/L2 cache-hints instead of L1/L2 template parameters. Signed-off-by: Klochkov, Vyacheslav N --- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 4 - sycl/include/sycl/ext/intel/esimd/common.hpp | 43 +- .../ext/intel/esimd/detail/memory_intrin.hpp | 27 -- sycl/include/sycl/ext/intel/esimd/memory.hpp | 440 +++++++----------- .../ext/intel/esimd/memory_properties.hpp | 8 + .../ext/intel/experimental/esimd/common.hpp | 13 +- .../ext/intel/experimental/esimd/memory.hpp | 88 ++-- sycl/test/esimd/lsc.cpp | 6 +- 8 files changed, 249 insertions(+), 380 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 7916011de0d53..34376fc693e0c 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -512,10 +512,6 @@ class ESIMDIntrinDescTable { {"lsc.load.merge.bti", {ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5), t8(6), t8(7), c8(0), a(1), aSI(2), a(3)}}}, - {"lsc_load_stateless", - {"lsc.load.stateless", - {ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5), - t8(6), t8(7), c8(0), a(1), c32(0)}}}, {"lsc_load_merge_stateless", {"lsc.load.merge.stateless", {ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5), diff --git a/sycl/include/sycl/ext/intel/esimd/common.hpp b/sycl/include/sycl/ext/intel/esimd/common.hpp index 25f5188ad38bb..155e3615dfdc4 100644 --- a/sycl/include/sycl/ext/intel/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/esimd/common.hpp @@ -551,10 +551,25 @@ constexpr bool are_both(cache_hint First, cache_hint Second, cache_hint Val) { enum class cache_action { prefetch, load, store, atomic }; -template -void check_cache_hint() { - constexpr auto L1H = cache_hint_wrap{}; - constexpr auto L2H = cache_hint_wrap{}; +template constexpr bool has_cache_hints() { + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); + return L1H != cache_hint::none || L2H != cache_hint::none; +} + +// Currently, this is just a wrapper around 'check_cache_hint' function. +// It accepts the compile-time properties that may include cache-hints +// to be verified. +template +void check_cache_hints() { + constexpr auto L1H = + cache_hint_wrap( + cache_hint::none)>{}; + constexpr auto L2H = + cache_hint_wrap( + cache_hint::none)>{}; if constexpr (Action == cache_action::prefetch) { static_assert( L1H.template is_one_of constexpr bool has_cache_hints() { - constexpr cache_hint L1H = - getPropertyValue(cache_hint::none); - constexpr cache_hint L2H = - getPropertyValue(cache_hint::none); - return L1H != cache_hint::none || L2H != cache_hint::none; -} - -// Currently, this is just a wrapper around 'check_cache_hint' function. -// It accepts the compile-time properties that may include cache-hints -// to be verified. -template -void check_cache_hints() { - constexpr cache_hint L1H = - getPropertyValue(cache_hint::none); - constexpr cache_hint L2H = - getPropertyValue(cache_hint::none); - check_cache_hint(); -} - constexpr lsc_data_size expand_data_size(lsc_data_size DS) { if (DS == lsc_data_size::u8) return lsc_data_size::u8u32; diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 1b4c6d9d08b56..034eab9d8ba7a 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -831,33 +831,6 @@ __esimd_lsc_load_merge_stateless( __ESIMD_DNS::vector_type_t()> pass_thru = 0) __ESIMD_INTRIN_END; -/// USM pointer gather. -/// Supported platforms: DG2, PVC -/// -/// Collects elements located at specified address and returns them -/// as a single \ref simd object. -/// -/// @tparam Ty is element type. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements to load per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param addrs is the load addresses. -/// @return is a vector of type T and N * to_int() -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t addrs) - __ESIMD_INTRIN_END; - /// USM pointer scatter. /// Supported platforms: DG2, PVC /// diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index e72d39fbb2b65..ff242a820690a 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -105,47 +105,7 @@ ESIMD_INLINE simd lsc_format_ret(simd Vals) { /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. -/// @tparam N is the number of channels (platform dependent). -/// @param p is the base pointer. -/// @param offsets is the zero-based offsets in bytes. -/// @param pred is predicates. -/// @return is a vector of type T and size N * NElts -template -__ESIMD_API simd gather_impl(const T *p, simd offsets, - simd_mask pred) { - static_assert(std::is_integral_v, "Unsupported offset type"); - check_lsc_vector_size(); - check_lsc_data_size(); - check_cache_hint(); - constexpr uint16_t AddressScale = 1; - constexpr int ImmOffset = 0; - constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); - constexpr lsc_vector_size VS = to_lsc_vector_size(); - constexpr auto Transposed = lsc_data_order::nontranspose; - using MsgT = typename lsc_expand_type::type; - simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - simd Tmp = - __esimd_lsc_load_stateless(pred.data(), addrs.data()); - return lsc_format_ret(Tmp); -} - -/// USM pointer gather. -/// Supported platforms: DG2, PVC -/// VISA instruction: lsc_load.ugm -/// -/// Collects elements located at specified address and returns them -/// as a single \ref simd object. -/// -/// @tparam T is element type. -/// @tparam NElts is the number of elements to load per address. -/// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. @@ -154,15 +114,19 @@ __ESIMD_API simd gather_impl(const T *p, simd offsets, /// to the returned result when the corresponding element of \p pred is 0. /// @return is a vector of type T and size N * NElts /// -template +template __ESIMD_API simd gather_impl(const T *p, simd offsets, simd_mask pred, simd pass_thru) { static_assert(std::is_integral_v, "Unsupported offset type"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -188,22 +152,25 @@ __ESIMD_API simd gather_impl(const T *p, simd offsets, /// @tparam T is element type. /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param offsets is the zero-based offsets in bytes. /// @param vals is values to store. /// @param pred is predicates. /// -template +template __ESIMD_API void scatter_impl(T *p, simd offsets, simd vals, simd_mask pred) { static_assert(std::is_integral_v, "Unsupported offset type"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -318,22 +285,15 @@ gather(const T *p, simd byte_offsets, simd_mask mask, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "gather() requires at least element-size alignment"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - // Use LSC lowering if L1/L2 or VS > 1. Also, if masked gather is - // not available, then LSC is the only lowering option. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !detail::isMaskedGatherScatterLLVMAvailable()) { + // Use LSC lowering if cache-hints are used or VS > 1. Also, if + // llvm.masked.gather is not available, then LSC is the only lowering option. + if constexpr (detail::has_cache_hints() || VS > 1 || + !detail::isMaskedGatherScatterLLVMAvailable()) { static_assert(VS == 1 || sizeof(T) >= 4, "VS > 1 is supprted only for 4- and 8-byte elements"); return detail::gather_impl(p, byte_offsets, mask, - pass_thru); + PropertyListT>(p, byte_offsets, mask, pass_thru); } else { simd Addrs(reinterpret_cast(p)); Addrs = Addrs + convert(byte_offsets); @@ -381,19 +341,13 @@ gather(const T *p, simd byte_offsets, simd_mask mask, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "gather() requires at least element-size alignment"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || detail::isMaskedGatherScatterLLVMAvailable()) { + if constexpr (detail::has_cache_hints() || VS > 1 || + detail::isMaskedGatherScatterLLVMAvailable() || + !detail::isPowerOf2(N, 32)) { simd PassThru; // it is intentionally undefined return gather(p, byte_offsets, mask, PassThru, props); } else { - static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N"); simd Addrs = reinterpret_cast(p); Addrs += convert(byte_offsets); using MsgT = detail::__raw_t; @@ -715,22 +669,15 @@ scatter(T *p, simd byte_offsets, simd vals, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "scatter() requires at least element-size alignment"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - // Use LSC lowering if L1/L2 or VS > 1. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || + // Use LSC lowering if cache-hints are used or VS > 1. + if constexpr (detail::has_cache_hints() || VS > 1 || (!__ESIMD_DNS::isPowerOf2(N, 32) && !detail::isMaskedGatherScatterLLVMAvailable())) { static_assert(VS == 1 || sizeof(T) >= 4, "VS > 1 is supprted only for 4- and 8-byte elements"); return detail::scatter_impl(p, byte_offsets, vals, mask); + PropertyListT>(p, byte_offsets, vals, mask); } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { simd Addrs(reinterpret_cast(p)); Addrs = Addrs + convert(byte_offsets); @@ -959,6 +906,7 @@ block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru) { getPropertyValue(cache_hint::none); constexpr cache_hint L2H = getPropertyValue(cache_hint::none); + constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -2618,8 +2566,7 @@ scatter_impl(AccessorTy acc, simd vals, simd offsets, /// @tparam T is element type. /// @tparam NElts is the number of elements to store per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. @@ -2627,8 +2574,8 @@ scatter_impl(AccessorTy acc, simd vals, simd offsets, /// @param vals is values to store. /// @param pred is predicates. /// -template +template __ESIMD_API std::enable_if_t< is_device_accessor_with_v> scatter_impl(AccessorTy acc, simd offsets, simd vals, @@ -2641,7 +2588,11 @@ scatter_impl(AccessorTy acc, simd offsets, simd vals, "convert offsets to a 32-bit vector"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -2713,8 +2664,8 @@ gather_impl(AccessorTy acc, simd offsets, uint32_t glob_offset, } #ifndef __ESIMD_FORCE_STATELESS_MEM -template +template __ESIMD_API std::enable_if_t< is_device_accessor_with_v, simd> @@ -2731,13 +2682,17 @@ gather_impl(AccessorT acc, simd byte_offsets, "VS > 1 is supprted only for 4- and 8-byte elements"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); constexpr auto Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); auto SI = get_surface_index(acc); simd ByteOffsets32 = convert(byte_offsets); simd PassThruExpanded = lsc_format_input(pass_thru); @@ -2827,21 +2782,24 @@ __ESIMD_API void slm_scatter_impl(simd offsets, /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam N is the number of channels (platform dependent). /// @param p is the base pointer. /// @param byte_offsets is the zero-based offsets in bytes. /// @param pred is predicates. /// -template +template __ESIMD_API void prefetch_impl(const T *p, simd byte_offsets, simd_mask pred) { static_assert(std::is_integral_v, "Unsupported offset type"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -2855,13 +2813,17 @@ __ESIMD_API void prefetch_impl(const T *p, simd byte_offsets, addrs.data()); } -template +template __ESIMD_API std::enable_if_t> prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) { check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = finalize_data_size(); @@ -2889,8 +2851,7 @@ prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) { /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam N is the number of channels (platform dependent). /// @tparam AccessorTy is the \ref sycl::accessor type. /// @tparam OffsetT is the type of \c byte_offsets. @@ -2899,8 +2860,8 @@ prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) { /// @param pred is predicates. /// -template +template __ESIMD_API std::enable_if_t< is_device_accessor_with_v> prefetch_impl(AccessorTy acc, simd byte_offsets, @@ -2913,7 +2874,11 @@ prefetch_impl(AccessorTy acc, simd byte_offsets, "convert offsets to a 32-bit vector"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -2936,15 +2901,16 @@ prefetch_impl(AccessorTy acc, simd byte_offsets, /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @tparam OffsetT is the type of \c byte_offset. /// @param acc is the SYCL accessor. /// @param byte_offset is the zero-based offset in bytes. +/// @param pred is operation predicate. Zero means operation is skipped +/// entirely, non-zero - operation is performed. /// -template +template __ESIMD_API std::enable_if_t< std::is_integral_v && is_device_accessor_with_v> @@ -2955,7 +2921,11 @@ prefetch_impl(AccessorTy acc, OffsetT byte_offset, simd_mask<1> pred) { "convert offsets to a 32-bit vector"); check_lsc_vector_size(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = finalize_data_size(); @@ -3020,7 +2990,8 @@ __ESIMD_API // Requires DG2 or PVC. simd PassThru; // Intentionally undefined byte_offsets += glob_offset; - return detail::gather_impl( acc, byte_offsets, mask, PassThru); } else { @@ -3151,17 +3122,7 @@ gather(AccessorT acc, simd byte_offsets, return gather(detail::accessorToPointer(acc), byte_offsets, mask, pass_thru, props); #else - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - - return detail::gather_impl( acc, byte_offsets, mask, pass_thru); #endif // __ESIMD_FORCE_STATELESS_MEM @@ -3214,20 +3175,11 @@ gather(AccessorT acc, simd byte_offsets, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "gather() requires at least element-size alignment"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !(detail::isPowerOf2(N, 32))) { + if constexpr (detail::has_cache_hints() || VS > 1 || + !(detail::isPowerOf2(N, 32))) { simd PassThru; // Intentionally undefined - return detail::gather_impl( acc, byte_offsets, mask, PassThru); } else { @@ -3481,20 +3433,11 @@ scatter(AccessorTy acc, simd byte_offsets, simd vals, detail::getPropertyValue(sizeof(T)); static_assert(Alignment >= sizeof(T), "gather() requires at least element-size alignment"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !detail::isPowerOf2(N, 32)) { - detail::scatter_impl(acc, byte_offsets, vals, mask); + if constexpr (detail::has_cache_hints() || VS > 1 || + !detail::isPowerOf2(N, 32)) { + detail::scatter_impl(acc, byte_offsets, vals, mask); } else { detail::scatter_impl(acc, vals, byte_offsets, 0, mask); } @@ -5896,21 +5839,24 @@ namespace detail { /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param pred is predicates. /// -template +template __ESIMD_API std::enable_if_t() == 0, simd> atomic_update_impl(T *p, simd offsets, simd_mask pred) { static_assert(sizeof(T) > 1, "Unsupported data type"); static_assert(std::is_integral_v, "Unsupported offset type"); check_atomic(); check_lsc_data_size(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -5935,15 +5881,14 @@ atomic_update_impl(T *p, simd offsets, simd_mask pred) { /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand. /// @param pred is predicates. /// -template +template __ESIMD_API std::enable_if_t() == 1, simd> atomic_update_impl(T *p, simd offsets, simd src0, simd_mask pred) { @@ -5951,7 +5896,11 @@ atomic_update_impl(T *p, simd offsets, simd src0, static_assert(std::is_integral_v, "Unsupported offset type"); check_lsc_data_size(); check_atomic(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -5977,16 +5926,15 @@ atomic_update_impl(T *p, simd offsets, simd src0, /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @param p is the base pointer. /// @param offsets is the zero-based offsets. /// @param src0 is the first atomic operand (expected value). /// @param src1 is the second atomic operand (new value). /// @param pred predicates. /// -template +template __ESIMD_API std::enable_if_t() == 2, simd> atomic_update_impl(T *p, simd offsets, simd src0, simd src1, simd_mask pred) { @@ -5994,7 +5942,11 @@ atomic_update_impl(T *p, simd offsets, simd src0, static_assert(std::is_integral_v, "Unsupported offset type"); check_lsc_data_size(); check_atomic(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -6021,8 +5973,7 @@ atomic_update_impl(T *p, simd offsets, simd src0, /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param byte_offsets is the zero-based offsets. @@ -6031,8 +5982,7 @@ atomic_update_impl(T *p, simd offsets, simd src0, /// update. template + typename PropertyListT, typename AccessorTy, typename Toffset> __ESIMD_API std::enable_if_t() == 0 && __ESIMD_DNS::is_rw_device_accessor_v, @@ -6040,15 +5990,19 @@ __ESIMD_API atomic_update_impl(AccessorTy acc, simd byte_offsets, simd_mask pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return atomic_update_impl(accessorToPointer(acc), - byte_offsets, pred); + return atomic_update_impl( + accessorToPointer(acc), byte_offsets, pred); #else static_assert(sizeof(T) > 1, "Unsupported data type"); static_assert(std::is_integral_v && sizeof(Toffset) == 4, "Unsupported offset type"); check_lsc_data_size(); check_atomic(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -6073,8 +6027,7 @@ __ESIMD_API /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param byte_offset is the zero-based offsets. @@ -6083,8 +6036,8 @@ __ESIMD_API /// /// @return A vector of the old values at the memory locations before the /// update. -template +template __ESIMD_API std::enable_if_t() == 1 && __ESIMD_DNS::is_rw_device_accessor_v, @@ -6092,15 +6045,19 @@ __ESIMD_API atomic_update_impl(AccessorTy acc, simd byte_offset, simd src0, simd_mask pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return atomic_update_impl(accessorToPointer(acc), - byte_offset, src0, pred); + return atomic_update_impl( + accessorToPointer(acc), byte_offset, src0, pred); #else static_assert(sizeof(T) > 1, "Unsupported data type"); static_assert(std::is_integral_v && sizeof(Toffset) == 4, "Unsupported offset type"); check_lsc_data_size(); check_atomic(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -6126,8 +6083,7 @@ __ESIMD_API /// @tparam T is element type. /// @tparam N is the number of channels (platform dependent). /// @tparam DS is the data size. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the properties with optional cache hints. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param byte_offset is the zero-based offsets. @@ -6137,8 +6093,8 @@ __ESIMD_API /// /// @return A vector of the old values at the memory locations before the /// update. -template +template __ESIMD_API std::enable_if_t() == 2 && __ESIMD_DNS::is_rw_device_accessor_v, @@ -6146,7 +6102,7 @@ __ESIMD_API atomic_update_impl(AccessorTy acc, simd byte_offset, simd src0, simd src1, simd_mask pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return atomic_update_impl( + return atomic_update_impl( __ESIMD_DNS::accessorToPointer(acc), byte_offset, src0, src1, pred); #else static_assert(std::is_integral_v && sizeof(Toffset) == 4, @@ -6154,7 +6110,11 @@ __ESIMD_API check_lsc_vector_size<1>(); check_lsc_data_size(); check_atomic(); - check_cache_hint(); + check_cache_hints(); + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); @@ -6230,22 +6190,10 @@ atomic_update(T *p, simd byte_offset, simd_mask mask, PropertyListT props = {}) { static_assert(std::is_integral_v, "Unsupported offset type"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || !__ESIMD_DNS::isPowerOf2(N, 32)) { return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>( p, byte_offset, mask); } else if constexpr (N == 16 || N == 32) { // TODO: In fact GPU BE supports legalization for any N, even for @@ -6463,25 +6411,13 @@ atomic_update(T *p, simd byte_offset, simd src0, simd_mask mask, PropertyListT props = {}) { static_assert(std::is_integral_v, "Unsupported offset type"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - // Auto-convert FP atomics to LSC version. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || (Op == atomic_op::fmin) || (Op == atomic_op::fmax) || (Op == atomic_op::fadd) || (Op == atomic_op::fsub) || !__ESIMD_DNS::isPowerOf2(N, 32)) { return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>( p, byte_offset, src0, mask); } else if constexpr (N == 16 || N == 32) { // TODO: In fact GPU BE supports legalization for any N, even for @@ -6726,27 +6662,15 @@ atomic_update(T *p, simd byte_offset, simd src0, simd src1, simd_mask mask, PropertyListT props = {}) { static_assert(std::is_integral_v, "Unsupported offset type"); - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - // Use LSC atomic when cache hints are present, FP atomics is used, // non-power of two length is used, or operation width greater than 32. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse // order, hence the src1/src0 swap. return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>( p, byte_offset, src1, src0, mask); } else if constexpr (N == 16 || N == 32) { // TODO: In fact GPU BE supports legalization for any N, even for @@ -6977,10 +6901,10 @@ atomic_update(AccessorTy acc, simd byte_offset, simd_mask mask, static_assert(std::is_integral_v, "Unsupported offset type"); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || !detail::isPowerOf2(N, 32)) { return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( acc, byte_offset, mask); } else { if constexpr (Op == atomic_op::load) { @@ -7238,26 +7162,15 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, return atomic_update(__ESIMD_DNS::accessorToPointer(acc), byte_offset, src0, mask, props); #else - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); static_assert(std::is_integral_v, "Unsupported offset type"); static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); // Auto-convert FP atomics to LSC version. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || Op == atomic_op::fmin || Op == atomic_op::fmax || Op == atomic_op::fadd || Op == atomic_op::fsub || !__ESIMD_DNS::isPowerOf2(N, 32)) { return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( acc, byte_offset, src0, mask); } else if constexpr (Op == atomic_op::store) { if constexpr (std::is_integral_v) { @@ -7547,28 +7460,17 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, return atomic_update(__ESIMD_DNS::accessorToPointer(acc), byte_offset, src0, src1, mask, props); #else - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); static_assert(std::is_integral_v, "Unsupported offset type"); static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); // Use LSC atomic when cache hints are present, FP atomics is used, // non-power of two length is used, or operation width greater than 32. - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse // order, hence the src1/src0 swap. return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>( + Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( acc, byte_offset, src1, src0, mask); } else { detail::check_atomic(); @@ -8592,15 +8494,8 @@ __ESIMD_API std::enable_if_t< prefetch(const T *p, simd byte_offsets, simd_mask mask, PropertyListT props = {}) { static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); - - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - detail::prefetch_impl(p, byte_offsets, mask); + detail::prefetch_impl(p, byte_offsets, mask); } /// template > prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - detail::prefetch_impl(p, byte_offset, mask); + detail::prefetch_impl(p, byte_offset, mask); } /// template byte_offsets, props); #else static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); - - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - detail::prefetch_impl(acc, byte_offsets, mask); + detail::prefetch_impl(acc, byte_offsets, mask); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -9128,14 +9010,8 @@ prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask, #ifdef __ESIMD_FORCE_STATELESS_MEM prefetch(detail::accessorToPointer(acc), byte_offset, mask, props); #else - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - detail::prefetch_impl(acc, byte_offset, mask); + detail::prefetch_impl(acc, byte_offset, mask); #endif // __ESIMD_FORCE_STATELESS_MEM } diff --git a/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp b/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp index 5b2f920914466..f6b02133d7d93 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp @@ -241,6 +241,14 @@ template using make_L1_L2_alignment_properties_t = typename make_L1_L2_alignment_properties::type; +// Creates the type for the list of L1 and L2 properties. +template struct make_L1_L2_properties { + using type = ext::oneapi::experimental::detail::properties_t< + cache_hint_L1_key::value_t, cache_hint_L2_key::value_t>; +}; +template +using make_L1_L2_properties_t = typename make_L1_L2_properties::type; + } // namespace detail } // namespace ext::intel::esimd diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index a71981de54bec..50467d44692e5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -104,20 +104,9 @@ template struct lsc_expand_type { } // namespace detail -/// L1 or L3 cache hint kinds. +/// L1 or L2 cache hint kinds. using cache_hint = __ESIMD_NS::cache_hint; -namespace detail { -// TODO: These enum and the function are kept here temporarily, while used -// inside this header file here. Remove it after all experimental functions -// working with cache hints are moved out of experimental namespace. -using lsc_action = __ESIMD_DNS::cache_action; -template -constexpr void check_lsc_cache_hint() { - __ESIMD_DNS::check_cache_hint(); -} -} // namespace detail - /// Represents a split barrier action. enum class split_barrier_action : uint8_t { wait = 0, // split barrier wait diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 4c50b02202468..5b867462470c7 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -661,7 +661,10 @@ template lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - return __ESIMD_DNS::gather_impl(p, offsets, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_NS::simd PassThru; // Intentionally undefined. + return __ESIMD_DNS::gather_impl(p, offsets, pred, + PassThru); } /// USM pointer gather. @@ -692,8 +695,9 @@ __ESIMD_API __ESIMD_NS::simd lsc_gather(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred, __ESIMD_NS::simd pass_thru) { - return __ESIMD_DNS::gather_impl(p, offsets, pred, - pass_thru); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::gather_impl(p, offsets, pred, + pass_thru); } template ( reinterpret_cast(acc.get_pointer().get()), offsets, pred); #else - __ESIMD_NS::simd PassThru; // Intentionally unitialized. - return __ESIMD_DNS::gather_impl( + __ESIMD_NS::simd PassThru; // Intentionally uninitialized. + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::gather_impl( acc, offsets, pred, PassThru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -850,7 +855,8 @@ __ESIMD_API reinterpret_cast(acc.get_pointer().get()), offsets, pred, pass_thru); #else - return __ESIMD_DNS::gather_impl( + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::gather_impl( acc, offsets, pred, pass_thru); #endif // __ESIMD_FORCE_STATELESS_MEM } @@ -971,8 +977,9 @@ __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, lsc_block_load(const T *p, FlagsT) { using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + __ESIMD_NS::simd PassThru; // Intentionally undefined. return __ESIMD_DNS::block_load_impl( - p, __ESIMD_NS::simd_mask<1>(1)); + p, __ESIMD_NS::simd_mask<1>(1), PassThru); } /// USM pointer transposed gather with 1 channel. @@ -1225,7 +1232,8 @@ template __ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred = 1) { - __ESIMD_DNS::prefetch_impl(p, offsets, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::prefetch_impl(p, offsets, pred); } template __ESIMD_API void lsc_prefetch(const T *p) { __ESIMD_NS::simd_mask<1> Mask = 1; - __ESIMD_DNS::prefetch_impl(p, 0, Mask); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::prefetch_impl(p, 0, Mask); } /// Accessor-based prefetch gather. @@ -1299,7 +1308,8 @@ lsc_prefetch(AccessorTy acc, lsc_prefetch(__ESIMD_DNS::accessorToPointer(acc), offsets, pred); #else - __ESIMD_DNS::prefetch_impl(acc, offsets, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::prefetch_impl(acc, offsets, pred); #endif } @@ -1346,7 +1356,8 @@ lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) { __ESIMD_DNS::accessorToPointer(acc, offset)); #else __ESIMD_NS::simd_mask<1> Mask = 1; - __ESIMD_DNS::prefetch_impl(acc, offset, Mask); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::prefetch_impl(acc, offset, Mask); #endif } @@ -1418,8 +1429,9 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - __ESIMD_DNS::scatter_impl(p, offsets, - vals, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::scatter_impl( + p, offsets, vals, pred); } template (__ESIMD_DNS::accessorToPointer(acc), offsets, vals, pred); #else - __ESIMD_DNS::scatter_impl(acc, offsets, vals, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::scatter_impl(acc, offsets, vals, + pred); #endif } @@ -1809,8 +1823,10 @@ template lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::load, + PropertyListT>(); using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); detail::check_lsc_block_2d_restrictions(); @@ -1919,7 +1935,9 @@ template (); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::prefetch, + PropertyListT>(); detail::check_lsc_block_2d_restrictions(); @@ -1966,7 +1984,9 @@ __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, __ESIMD_NS::simd Vals) { using RawT = __ESIMD_DNS::__raw_t; - detail::check_lsc_cache_hint(); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::store, + PropertyListT>(); detail::check_lsc_block_2d_restrictions(); @@ -2230,7 +2250,9 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd lsc_load_2d( detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::load, + PropertyListT>(); constexpr int ElemsPerDword = 4 / sizeof(T); constexpr int GRFRowSize = Transposed ? BlockHeight : Transformed ? BlockWidth * ElemsPerDword @@ -2321,7 +2343,9 @@ template ()> ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d( config_2d_mem_access &payload) { - detail::check_lsc_cache_hint(); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::load, + PropertyListT>(); detail::check_lsc_block_2d_restrictions(); @@ -2368,7 +2392,9 @@ lsc_store_2d(config_2d_mem_access &payload, detail::check_lsc_block_2d_restrictions(); - detail::check_lsc_cache_hint(); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::store, + PropertyListT>(); constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask() << 17; @@ -2492,7 +2518,8 @@ __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( p, offsets, pred); } @@ -2531,7 +2558,8 @@ __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, pred); } @@ -2589,7 +2617,8 @@ __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( p, offsets, src0, src1, pred); } @@ -2650,8 +2679,9 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, - pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( + acc, offsets, pred); } /// Variant of \c lsc_atomic_update that uses \c local_accessor as a parameter. @@ -2707,8 +2737,9 @@ __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, __ESIMD_NS::simd> lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl(acc, offsets, - src0, pred); + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( + acc, offsets, src0, pred); } /// Variant of \c lsc_atomic_update that uses \c local_accessor as a parameter. @@ -2767,7 +2798,8 @@ __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v, lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - return __ESIMD_DNS::atomic_update_impl( + using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t; + return __ESIMD_DNS::atomic_update_impl( acc, offsets, src0, src1, pred); } diff --git a/sycl/test/esimd/lsc.cpp b/sycl/test/esimd/lsc.cpp index 4578404a093c0..bcc8e3e5bc882 100644 --- a/sycl/test/esimd/lsc.cpp +++ b/sycl/test/esimd/lsc.cpp @@ -57,7 +57,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc) { // CHECK: call void @llvm.genx.lsc.store.stateless.v4i1.v4i64.v4i32(<4 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0) lsc_scatter(ptr, offsets, data1); - // CHECK: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) simd data3 = lsc_gather(ptr, offsets); // CHECK: call void @llvm.genx.lsc.prefetch.stateless.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0) @@ -84,11 +84,11 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc) { lsc_scatter(acc, offsets, data1); // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) simd data5 = lsc_gather(acc, offsets); // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) data5 = lsc_gather(acc, offsets, mask); // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})