diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 209fd86411369..e86e2a9a9d969 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -645,6 +645,104 @@ __esimd_lsc_xatomic_bti_2( } #endif // __SYCL_DEVICE_ONLY__ +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_0(__ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. 
+/// @param src0 is the first atomic operand. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_1( + __ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets, + __ESIMD_DNS::vector_type_t()> src0) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand. +/// @param src1 is the second atomic operand. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_2( + __ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets, + __ESIMD_DNS::vector_type_t()> src0, + __ESIMD_DNS::vector_type_t()> src1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + __ESIMD_INTRIN void __esimd_slm_init(uint32_t size) #ifdef __SYCL_DEVICE_ONLY__ ; diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index b21293f87c522..e9e70f65c294b 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3680,42 +3680,359 @@ lsc_format_ret(__ESIMD_NS::simd Vals) { } } +/// SLM atomic. 
+/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. + +template +__ESIMD_API std::enable_if_t() == 0, simd> +slm_atomic_update_impl(simd offsets, simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Tmp = + __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); + return lsc_format_ret(Tmp); +} + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t() == 1, simd> +slm_atomic_update_impl(simd offsets, simd src0, + simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Msg_data = lsc_format_input(src0); + simd Tmp = + __esimd_lsc_xatomic_slm_1(pred.data(), offsets.data(), + Msg_data.data()); + return lsc_format_ret(Tmp); +} + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand (expected value). +/// @param src1 is the second atomic operand (new value). +/// @param pred is predicates. +/// +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API simd slm_atomic_update_impl(simd offsets, + simd src0, simd src1, + simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Msg_data0 = lsc_format_input(src0); + simd Msg_data1 = lsc_format_input(src1); + simd Tmp = + __esimd_lsc_xatomic_slm_2(pred.data(), offsets.data(), + Msg_data0.data(), Msg_data1.data()); + return lsc_format_ret(Tmp); +} + } // namespace detail -/// Atomic update operation performed on SLM. No source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update0 "atomic update" operation docs. -template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic0(mask.data(), si, offsets.data()); +/// @anchor slm_atomic_update0 +/// @brief Atomic update operation performed on SLM. +/// No-argument variant of the atomic update operation. + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd_mask mask = 1); /// (slm-au0-1) + +/// The following functions do the same work as slm_atomic_update(). They accept +/// a local accessor \p lacc and the atomic update is done from SLM associated +/// with \p lacc plus \p byte_offset applied to it. If \p byte_offset +/// is omitted, then zero offset is used. + +/// simd atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd_mask<1> pred = 1); +/// // (lacc-au0-1) + +/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd_mask mask = 1); /// (slm-au0-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation - can be \c atomic_op::inc or +/// \c atomic_op::dec, \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, simd> +slm_atomic_update(simd byte_offset, simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + return slm_atomic_update_impl( + byte_offset, mask); + } else if constexpr (Op == atomic_op::load) { + if constexpr (std::is_integral_v) { + return slm_atomic_update(byte_offset, + simd(0), mask); + } else { + using Tint = detail::uint_type_t; + simd Res = slm_atomic_update( + byte_offset, simd(0), mask); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic0(mask.data(), si, byte_offset.data()); + } } -/// Atomic update operation performed on SLM. One source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update1 "atomic update" operation docs. 
-template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd src0, simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic1(mask.data(), si, offsets.data(), - src0.data()); -} - -/// Atomic update operation performed on SLM. Two source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update2 "atomic update" operation docs. -template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd src0, simd src1, - simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic2(mask.data(), si, offsets.data(), - src0.data(), src1.data()); +/// simd atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd_mask pred = 1); +/// // (lacc-au0-1) +/// Atomically updates \c N memory locations in SLM associated +/// with the local accessor \p lacc at the given \p byte_offset, +/// and returns a vector of old values found at the memory locations before +/// update. +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, + simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, mask); +} + +/// One argument variant of the atomic update operation. + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, +/// simd_mask mask = 1); /// (slm-au1-1) +/// + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// + +/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, +/// simd_mask mask = 1) /// (slm-au1-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> +slm_atomic_update(simd byte_offset, simd src0, + simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + // half and short are supported in LSC. 
+ return slm_atomic_update_impl( + byte_offset, src0, mask); + } else if constexpr (Op == atomic_op::store) { + if constexpr (std::is_integral_v) { + return slm_atomic_update(byte_offset, src0, mask); + } else { + using Tint = detail::uint_type_t; + simd Res = slm_atomic_update( + byte_offset, src0.template bit_cast_view(), mask); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic1(mask.data(), si, byte_offset.data(), + src0.data()); + } +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, src0, mask); +} + +/// Two argument variant of the atomic update operation. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); /// (slm-au2-1) + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); /// (slm-au2-1) +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> +slm_atomic_update(simd byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + // 2-argument lsc_atomic_update arguments order matches the standard one - + // expected value first, then new value. But atomic_update uses reverse + // order, hence the src1/src0 swap. 
+ return detail::slm_atomic_update_impl( + byte_offset, src1, src0, mask); + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic2(mask.data(), si, byte_offset.data(), + src0.data(), src1.data()); + } +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask<1> pred = 1); // (lacc-au2-1) +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, src0, src1, mask); } /// @} sycl_esimd_memory_slm @@ -5489,49 +5806,6 @@ atomic_update(AccessorTy acc, simd byte_offset, return atomic_update(acc, byte_offset, mask, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has no arguments -/// in addition to the value at the memory location. -/// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. 
-/// @return A vector of the old values at the memory locations before the -/// update. -/// -template -__ESIMD_API __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && - sycl::detail::acc_properties::is_local_accessor_v, - simd> -atomic_update(AccessorTy acc, simd byte_offset, - simd_mask mask) { - if constexpr (Op == atomic_op::load) { - if constexpr (std::is_integral_v) { - return atomic_update(acc, byte_offset, - simd(0), mask); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, byte_offset, simd(0), mask); - return Res.template bit_cast_view(); - } - } else { - return slm_atomic_update( - byte_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); - } -} - /// simd /// atomic_update(AccessorT acc, simd_view byte_offset, /// simd_mask mask, props = {}); /// (acc-au0-3) @@ -5604,36 +5878,6 @@ atomic_update(AccessorTy acc, simd_view byte_offset, return atomic_update(acc, byte_offset.read(), mask, props); } -/// A variation of \c atomic_update API with \c offsets represented as -/// \c simd_view object. -/// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. 
-/// -template > -__ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && std::is_integral_v && - !std::is_pointer_v && - sycl::detail::acc_properties::is_local_accessor_v, - simd> -atomic_update(AccessorTy acc, simd_view byte_offset, - simd_mask mask) { - return atomic_update(acc, byte_offset.read(), mask); -} - /// A variation of \c atomic_update API with \c offset represented as /// scalar. /// @@ -5936,56 +6180,6 @@ atomic_update(AccessorTy acc, simd byte_offset, return atomic_update(acc, byte_offset, src0.read(), mask, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has 1 additional -/// argument. -/// -/// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c -/// atomic_op::fsub, \c atomic_op::store. -/// @tparam Tx The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The additional argument. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. 
-/// -template -__ESIMD_API std::enable_if_t< - sycl::detail::acc_properties::is_local_accessor_v, simd> -atomic_update(AccessorTy acc, simd offset, simd src0, - simd_mask mask) { - if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) || - (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) { - // Auto-convert FP atomics to LSC version. - return atomic_update(), Tx, N>(acc, offset, - src0, mask); - } else if constexpr (Op == atomic_op::store) { - if constexpr (std::is_integral_v) { - return atomic_update(acc, offset, src0, mask); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, offset, src0.template bit_cast_view(), mask); - return Res.template bit_cast_view(); - } - } else { - return slm_atomic_update( - offset + __ESIMD_DNS::localAccessorToOffset(acc), src0, mask); - } -} - /// simd /// atomic_update(AccessorT acc, simd_view byte_offset, /// simd src0, @@ -6023,6 +6217,7 @@ template __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 1 && !std::is_pointer_v && + !sycl::detail::acc_properties::is_local_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(AccessorTy acc, simd_view byte_offset, @@ -6065,6 +6260,7 @@ template __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 1 && !std::is_pointer_v && + !sycl::detail::acc_properties::is_local_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(AccessorTy acc, simd_view byte_offset, @@ -6414,43 +6610,6 @@ atomic_update(AccessorTy acc, simd_view byte_offset, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets and returns a vector of old -/// values found at the memory locations before update. The update operation -/// has 2 additional arguments. 
-/// -/// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam Tx The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The first additional argument (new value). -/// @param src1 The second additional argument (expected value). -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. -/// -template -__ESIMD_API std::enable_if_t< - sycl::detail::acc_properties::is_local_accessor_v, simd> -atomic_update(AccessorTy acc, simd offset, simd src0, - simd src1, simd_mask mask) { - if constexpr (Op == atomic_op::fcmpxchg) { - // Auto-convert FP atomics to LSC version. - return atomic_update(), Tx, N>( - acc, offset, src0, src1, mask); - } else { - return slm_atomic_update( - offset + __ESIMD_DNS::localAccessorToOffset(acc), src0, src1, mask); - } -} - /// A variation of \c atomic_update API with \c offsets represented as /// scalar. /// diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index f4eda3b4c98f6..36f726b38eed5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -285,96 +285,17 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, } #endif // __SYCL_DEVICE_ONLY__ -/// SLM atomic. +/// Memory fence. 
/// Supported platforms: DG2, PVC /// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param offsets is the zero-based offsets. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_0(__ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - __ESIMD_UNSUPPORTED_ON_HOST; -} -#endif // __SYCL_DEVICE_ONLY__ - -/// SLM atomic. -/// Supported platforms: DG2, PVC -/// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param offsets is the zero-based offsets. -/// @param src0 is the first atomic operand. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_1( - __ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets, - __ESIMD_DNS::vector_type_t()> src0) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - __ESIMD_UNSUPPORTED_ON_HOST; -} -#endif // __SYCL_DEVICE_ONLY__ - -/// SLM atomic. 
-/// Supported platforms: DG2, PVC -/// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam Kind is the Sfid shaded function. +/// @tparam FenceOp is the fence operation. +/// @tparam Scope is the operation scope. /// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. -/// @param offsets is the zero-based offsets. -/// @param src0 is the first atomic operand. -/// @param src1 is the second atomic operand. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_2( - __ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets, - __ESIMD_DNS::vector_type_t()> src0, - __ESIMD_DNS::vector_type_t()> src1) +template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp, + __ESIMD_ENS::lsc_scope Scope, int N> +__ESIMD_INTRIN void __esimd_lsc_fence(__ESIMD_DNS::simd_mask_storage_t pred) #ifdef __SYCL_DEVICE_ONLY__ ; #else // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 358af22ef00fb..e4cfca89f2953 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2588,24 +2588,7 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type"); - __ESIMD_EDNS::check_lsc_vector_size<1>(); - 
__ESIMD_EDNS::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, pred); } /// SLM atomic. @@ -2628,29 +2611,7 @@ __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - static_assert(Op != __ESIMD_NS::atomic_op::fadd && - Op != __ESIMD_NS::atomic_op::fsub, - "fadd and fsub are not supported for slm."); - static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type"); - detail::check_lsc_vector_size<1>(); - detail::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Msg_data = detail::lsc_format_input(src0); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_1(pred.data(), offsets.data(), - Msg_data.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, src0, pred); } /// SLM atomic. 
@@ -2674,29 +2635,8 @@ __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - static_assert(sizeof(T) == 2 || sizeof(T) == 4 || - (Op == __ESIMD_NS::atomic_op::cmpxchg && sizeof(T) == 8), - "Unsupported data type"); - detail::check_lsc_vector_size<1>(); - detail::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Msg_data0 = detail::lsc_format_input(src0); - __ESIMD_NS::simd Msg_data1 = detail::lsc_format_input(src1); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_2( - pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, src0, src1, + pred); } /// USM pointer atomic. diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp new file mode 100644 index 0000000000000..e025d118d77fa --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp @@ -0,0 +1,756 @@ +//==-------atomic_update_slm.hpp - DPC++ ESIMD on-device test --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +constexpr int Signed = 1; +constexpr int Unsigned = 2; + +constexpr int64_t threads_per_group = 8; +constexpr int64_t n_groups = 1; +constexpr int64_t start_ind = 3; +constexpr int64_t masked_lane = 1; +constexpr int64_t repeat = 1; +constexpr int64_t stride = 4; + +// Helper functions + +const char *to_string(atomic_op op) { + switch (op) { + case atomic_op::add: + return "add"; + case atomic_op::sub: + return "sub"; + case atomic_op::inc: + return "inc"; + case atomic_op::dec: + return "dec"; + case atomic_op::umin: + return "umin"; + case atomic_op::umax: + return "umax"; + case atomic_op::xchg: + return "xchg"; + case atomic_op::cmpxchg: + return "cmpxchg"; + case atomic_op::bit_and: + return "bit_and"; + case atomic_op::bit_or: + return "bit_or"; + case atomic_op::bit_xor: + return "bit_xor"; + case atomic_op::smin: + return "smin"; + case atomic_op::smax: + return "smax"; + case atomic_op::fmax: + return "fmax"; + case atomic_op::fmin: + return "fmin"; + case atomic_op::fadd: + return "fadd"; + case atomic_op::fsub: + return "fsub"; + case atomic_op::fcmpxchg: + return "fcmpxchg"; + case atomic_op::load: + return "load"; + case atomic_op::store: + return "store"; + case atomic_op::predec: + return "predec"; + } + return ""; +} + +template inline bool any(simd_mask m, simd_mask ignore_mask) { + simd_mask m1 = 0; + m.merge(m1, ignore_mask); + return m.any(); +} + +// The main test function + +template class ImplF, bool UseMask> +bool test_slm(queue q) { + constexpr auto op = ImplF::atomic_op; + using CurAtomicOpT = decltype(op); + constexpr int n_args = ImplF::n_args; + + std::cout << "SLM testing" << " op=" << to_string(op) + << " T=" << esimd_test::type_name() << " N=" << N 
<< "\n\t" + << " UseMask=" << (UseMask ? "true" : "false") + << "{ thr_per_group=" << threads_per_group + << " n_groups=" << n_groups << " start_ind=" << start_ind + << " masked_lane=" << masked_lane << " repeat=" << repeat + << " stride=" << stride << " }..."; + + constexpr size_t size = start_ind + (N - 1) * stride + 1; + T *arr = malloc_shared(size, q); + constexpr int n_threads = threads_per_group * n_groups; + + for (int i = 0; i < size; ++i) { + arr[i] = ImplF::init(i); + } + + range<1> glob_rng(n_threads); + range<1> loc_rng(threads_per_group); + nd_range<1> rng(glob_rng, loc_rng); + + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for(rng, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + int i = ndi.get_global_id(0); + constexpr uint32_t SLMSize = size * sizeof(T); + slm_init(); + + simd offsets(start_ind * sizeof(T), stride * sizeof(T)); + simd data; + data.copy_from(arr); + + if (ndi.get_local_id(0) == 0) + slm_block_store(0, data); + + simd_mask m = 1; + if constexpr (UseMask) { + if (masked_lane < N) + m[masked_lane] = 0; + } + // Intra-work group barrier. 
+ barrier(); + + // the atomic operation itself applied in a loop: + for (int cnt = 0; cnt < repeat; ++cnt) { + if constexpr (n_args == 0) { + if constexpr (UseMask) { + slm_atomic_update(offsets, m); + } else { + slm_atomic_update(offsets); + } + } else if constexpr (n_args == 1) { + simd v0 = ImplF::arg0(i); + if constexpr (UseMask) { + slm_atomic_update(offsets, v0, m); + } else { + slm_atomic_update(offsets, v0); + } + } else if constexpr (n_args == 2) { + simd new_val = ImplF::arg0(i); // new value + simd exp_val = ImplF::arg1(i); // expected value + // do compare-and-swap in a loop until we get expected value; + // arg0 and arg1 must provide values which guarantee the loop + // is not endless: + if constexpr (UseMask) { + for (simd old_val = + slm_atomic_update(offsets, new_val, exp_val, m); + any(old_val < exp_val, !m); + old_val = + slm_atomic_update(offsets, new_val, exp_val, m)) + ; + } else { + for (simd old_val = + slm_atomic_update(offsets, new_val, exp_val); + any(old_val < exp_val, !m); + old_val = + slm_atomic_update(offsets, new_val, exp_val)) + ; + } + } + } + barrier(); + if (ndi.get_local_id(0) == 0) { + auto data0 = slm_block_load(0); + data0.copy_to(arr); + } + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(arr, q); + return false; + } + int err_cnt = 0; + + for (int i = 0; i < size; ++i) { + T gold = ImplF::gold(i, UseMask); + T test = arr[i]; + + if ((gold != test) && (++err_cnt < 10)) { + if (err_cnt == 1) { + std::cout << "\n"; + } + std::cout << " failed at index " << i << ": " << test << " != " << gold + << "(gold)\n"; + } + } + if (err_cnt > 0) { + std::cout << " FAILED\n pass rate: " + << ((float)(size - err_cnt) / (float)size) * 100.0f << "% (" + << (size - err_cnt) << "/" << size << ")\n"; + } else { + std::cout << " passed\n"; + } + free(arr, q); + return err_cnt == 0; +} + +template class ImplF, bool UseMask> +bool test_slm_acc(queue q) { + constexpr 
auto op = ImplF::atomic_op; + using CurAtomicOpT = decltype(op); + constexpr int n_args = ImplF::n_args; + + std::cout << "SLM ACC testing" << " op=" << to_string(op) + << " T=" << esimd_test::type_name() << " N=" << N << "\n\t" + << " UseMask=" << (UseMask ? "true" : "false") + << "{ thr_per_group=" << threads_per_group + << " n_groups=" << n_groups << " start_ind=" << start_ind + << " masked_lane=" << masked_lane << " repeat=" << repeat + << " stride=" << stride << " }..."; + + constexpr size_t size = start_ind + (N - 1) * stride + 1; + T *arr = malloc_shared(size, q); + constexpr int n_threads = threads_per_group * n_groups; + + for (int i = 0; i < size; ++i) { + arr[i] = ImplF::init(i); + } + + range<1> glob_rng(n_threads); + range<1> loc_rng(threads_per_group); + nd_range<1> rng(glob_rng, loc_rng); + + try { + auto e = q.submit([&](handler &cgh) { + local_accessor LocalAcc(size, cgh); + cgh.parallel_for(rng, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL { + int i = NDI.get_global_id(0); + uint16_t LocalID = NDI.get_local_id(0); + simd offsets(start_ind * sizeof(T), stride * sizeof(T)); + + if (LocalID == 0) + for (int I = 0; I < threads_per_group * N; I++) + LocalAcc[I] = arr[i * N + I]; + barrier(); + + simd_mask m = 1; + if constexpr (UseMask) { + if (masked_lane < N) + m[masked_lane] = 0; + } + // Intra-work group barrier. 
+ barrier(); + + // the atomic operation itself applied in a loop: + for (int cnt = 0; cnt < repeat; ++cnt) { + if constexpr (n_args == 0) { + if constexpr (UseMask) { + atomic_update(LocalAcc, offsets, m); + } else { + atomic_update(LocalAcc, offsets); + } + } else if constexpr (n_args == 1) { + simd v0 = ImplF::arg0(i); + if constexpr (UseMask) { + atomic_update(LocalAcc, offsets, v0, m); + } else { + atomic_update(LocalAcc, offsets, v0); + } + } else if constexpr (n_args == 2) { + simd new_val = ImplF::arg0(i); // new value + simd exp_val = ImplF::arg1(i); // expected value + // do compare-and-swap in a loop until we get expected value; + // arg0 and arg1 must provide values which guarantee the loop + // is not endless: + if constexpr (UseMask) { + for (simd old_val = atomic_update( + LocalAcc, offsets, new_val, exp_val, m); + any(old_val < exp_val, !m); + old_val = atomic_update(LocalAcc, offsets, new_val, + exp_val, m)) + ; + } else { + for (simd old_val = atomic_update(LocalAcc, offsets, + new_val, exp_val); + any(old_val < exp_val, !m); + old_val = atomic_update(LocalAcc, offsets, new_val, + exp_val)) + ; + } + } + } + barrier(); + if (LocalID == 0) + for (int I = 0; I < threads_per_group * N; I++) + arr[i * N + I] = LocalAcc[I]; + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(arr, q); + return false; + } + int err_cnt = 0; + + for (int i = 0; i < size; ++i) { + T gold = ImplF::gold(i, UseMask); + T test = arr[i]; + + if ((gold != test) && (++err_cnt < 10)) { + if (err_cnt == 1) { + std::cout << "\n"; + } + std::cout << " failed at index " << i << ": " << test << " != " << gold + << "(gold)\n"; + } + } + if (err_cnt > 0) { + std::cout << " FAILED\n pass rate: " + << ((float)(size - err_cnt) / (float)size) * 100.0f << "% (" + << (size - err_cnt) << "/" << size << ")\n"; + } else { + std::cout << " passed\n"; + } + free(arr, q); + return err_cnt == 0; +} + +// ----------------- 
Functions providing input and golden values for atomic +// ----------------- operations. + +static int dense_ind(int ind, int VL) { return (ind - start_ind) / stride; } + +static bool is_updated(int ind, int VL, bool use_mask) { + if ((ind < start_ind) || (((ind - start_ind) % stride) != 0)) { + return false; + } + int ii = dense_ind(ind, VL); + bool res = true; + if (use_mask) + res = (ii % VL) != masked_lane; + return res; +} + +// ----------------- Actual "traits" for each operation. + +template struct ImplIncBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + + static T init(int i) { return (T)0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(repeat * threads_per_group * n_groups) + : init(i); + return gold; + } +}; + +template struct ImplDecBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + static constexpr int base = 5; + + static T init(int i) { + return (T)(repeat * threads_per_group * n_groups + base); + } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? (T)base : init(i); + return gold; + } +}; + +// The purpose of this is validate that floating point data is correctly +// processed. +constexpr float FPDELTA = 0.5f; + +template struct ImplLoadBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + + static T init(int i) { return (T)(i + FPDELTA); } + + static T gold(int i, bool use_mask) { + T gold = init(i); + return gold; + } +}; + +template struct ImplStoreBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + static constexpr T base = (T)(2 + FPDELTA); + + static T init(int i) { return 0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? 
base : init(i); + return gold; + } + + static T arg0(int i) { return base; } +}; + +template struct ImplAdd { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return 0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(repeat * threads_per_group * n_groups * (T)(1 + FPDELTA)) + : init(i); + return gold; + } + + static T arg0(int i) { return (T)(1 + FPDELTA); } +}; + +template struct ImplSub { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + static constexpr T base = (T)(5 + FPDELTA); + + static T init(int i) { + return (T)(repeat * threads_per_group * n_groups * (T)(1 + FPDELTA) + base); + } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? base : init(i); + return gold; + } + + static T arg0(int i) { return (T)(1 + FPDELTA); } +}; + +template struct ImplMin { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return std::numeric_limits::max(); } + + static T gold(int i, bool use_mask) { + T ExpectedFoundMin; + if constexpr (std::is_signed_v) + ExpectedFoundMin = FPDELTA - (threads_per_group * n_groups - 1); + else + ExpectedFoundMin = FPDELTA; + T gold = is_updated(i, N, use_mask) ? ExpectedFoundMin : init(i); + return gold; + } + + static T arg0(int i) { + int64_t sign = std::is_signed_v ? -1 : 1; + return sign * i + FPDELTA; + } +}; + +template struct ImplMax { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return std::numeric_limits::lowest(); } + + static T gold(int i, bool use_mask) { + T ExpectedFoundMax = FPDELTA; + if constexpr (!std::is_signed_v) + ExpectedFoundMax += threads_per_group * n_groups - 1; + + T gold = is_updated(i, N, use_mask) ? ExpectedFoundMax : init(i); + return gold; + } + + static T arg0(int i) { + int64_t sign = std::is_signed_v ? 
-1 : 1; + return sign * i + FPDELTA; + } +}; + +template +struct ImplStore : ImplStoreBase {}; +template +struct ImplLoad : ImplLoadBase {}; +template +struct ImplInc : ImplIncBase {}; +template +struct ImplDec : ImplDecBase {}; +template +struct ImplIntAdd : ImplAdd {}; +template +struct ImplIntSub : ImplSub {}; +template +struct ImplSMin : ImplMin {}; +template +struct ImplUMin : ImplMin {}; +template +struct ImplSMax : ImplMax {}; +template +struct ImplUMax : ImplMax {}; + +template +struct ImplFadd : ImplAdd {}; +template +struct ImplFsub : ImplSub {}; +template +struct ImplLSCFmin : ImplMin {}; +template +struct ImplLSCFmax : ImplMax {}; + +template struct ImplCmpxchgBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 2; + static constexpr T base = (T)(2 + FPDELTA); + + static T init(int i) { return base - 1; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(threads_per_group * n_groups - 1 + base) + : init(i); + return gold; + } + + // "Replacement value" argument in CAS + static inline T arg0(int i) { return i + base; } + + // "Expected value" argument in CAS + static inline T arg1(int i) { return i + base - 1; } +}; + +template +struct ImplCmpxchg : ImplCmpxchgBase {}; + +template +struct ImplLSCFcmpwr : ImplCmpxchgBase {}; + +// ----------------- Main function and test combinations. 
+ +template class ImplF, + bool UseMask> +auto run_test(queue q) { + if constexpr (UseAcc) { + return test_slm_acc(q); + } else { + return test_slm(q); + } +} + +template class Op, bool UseMask, + bool UsePVCFeatures, bool UseAcc, int SignMask = (Signed | Unsigned)> +bool test_int_types(queue q) { + bool passed = true; + if constexpr (SignMask & Signed) { + if constexpr (UsePVCFeatures) + passed &= run_test(q); + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + passed &= run_test(q); + } + } + + if constexpr (SignMask & Unsigned) { + if constexpr (UsePVCFeatures) + passed &= run_test(q); + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + passed &= run_test(q); + } + } + return passed; +} + +template class Op, bool UseMask, + bool UsePVCFeatures, bool UseAcc> +bool test_fp_types(queue q) { + bool passed = true; + if constexpr (UsePVCFeatures) { + if constexpr (std::is_same_v, + ImplLSCFmax> || + std::is_same_v, + ImplLSCFmin> || + std::is_same_v, + ImplLSCFcmpwr>) { + auto dev = q.get_device(); + if (dev.has(sycl::aspect::fp16)) { + passed &= run_test(q); + } + } + } + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + // TODO: fmin/max for double does not pass validation likely due to + // a driver bug. fcmpwr is hanging. + if constexpr (!std::is_same_v, ImplLSCFmax> && + !std::is_same_v, ImplLSCFmin> && + !std::is_same_v, ImplLSCFcmpwr>) { + if (q.get_device().has(sycl::aspect::atomic64) && + q.get_device().has(sycl::aspect::fp64)) { + passed &= run_test(q); + } + } + } + return passed; +} + +template