From 66fd3230ed0852e55e058cd0bc83ead7ed76fa4e Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Tue, 23 Sep 2025 18:14:53 -0400
Subject: [PATCH] 1. Adding a stream API for non-temporal data transfers
 2. Adding xsimd::fence as a wrapper around std::atomic_thread_fence
 3. Adding tests

---
 .../xsimd/arch/common/xsimd_common_memory.hpp | 36 ++++++++
 include/xsimd/arch/xsimd_avx.hpp              | 17 ++++
 include/xsimd/arch/xsimd_avx2.hpp             | 17 ++++
 include/xsimd/arch/xsimd_avx512f.hpp          | 34 ++++++++
 include/xsimd/arch/xsimd_sse2.hpp             | 17 ++++
 include/xsimd/arch/xsimd_sse4_1.hpp           | 17 ++++
 include/xsimd/memory/xsimd_alignment.hpp      | 11 +++
 include/xsimd/types/xsimd_api.hpp             | 84 +++++++++++++++++++
 include/xsimd/types/xsimd_batch.hpp           | 62 ++++++++++++++
 test/test_load_store.cpp                      | 56 +++++++++++++
 10 files changed, 351 insertions(+)

diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index 4ad148a6f..d2a93a3b1 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -298,6 +298,12 @@ namespace xsimd
             return load_unaligned(mem, b, A {});
         }
 
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
+        {
+            return load_aligned(mem, b, A {});
+        }
+
         // load_aligned
         namespace detail
         {
@@ -348,6 +354,12 @@ namespace xsimd
             return detail::load_unaligned<A>(mem, cvt, common {}, detail::conversion_type<A, T_in, T_out> {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
+        {
+            return load_aligned<A>(mem, cvt, A {});
+        }
+
         // rotate_right
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -589,6 +601,12 @@ namespace xsimd
                 mem[i] = bool(buffer[i]);
         }
 
+        template <class A, class T>
+        XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
+        {
+            store(self, mem, A {});
+        }
+
         // store_aligned
         template <class A, class T_in, class T_out>
         XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -607,6 +625,12 @@ namespace xsimd
             return store_aligned<A>(mem, self, common {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
+        {
+            store_aligned(mem, self, A {});
+        }
+
         // swizzle
         template <class A, class T, class ITy, ITy... Vs>
         XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -688,6 +712,12 @@ namespace xsimd
             return detail::load_complex(hi, lo, A {});
         }
 
+        template <class A, class T>
+        XSIMD_INLINE batch<std::complex<T>, A> load_complex_stream(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<common>) noexcept
+        {
+            return load_complex_aligned<A>(mem, kernel::convert<std::complex<T>> {}, A {});
+        }
+
         // store_complex_aligned
         template <class A, class T>
         XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<common>) noexcept
@@ -712,6 +742,12 @@ namespace xsimd
             hi.store_unaligned(buffer + real_batch::size);
         }
 
+        template <class A, class T>
+        XSIMD_INLINE void store_complex_stream(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<common>) noexcept
+        {
+            store_complex_aligned(dst, src, A {});
+        }
+
         // transpose
         template <class A, class T>
         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 9d93be071..836f5a161 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1404,6 +1404,23 @@ namespace xsimd
             return _mm256_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_pd(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_si256((__m256i*)mem, self);
+        }
+
         // sub
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 2acc650b7..837e58dfa 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -116,6 +116,23 @@ namespace xsimd
             }
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_stream_load_si256((__m256i const*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+
         // bitwise_and
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 4daf0a02f..bfad8c4c4 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1391,6 +1391,23 @@ namespace xsimd
             return _mm512_loadu_pd(mem);
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_stream_load_si512((__m512i*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2171,6 +2188,23 @@ namespace xsimd
             return _mm512_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_si512((__m512i*)mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_pd(mem, self);
+        }
+
         // sub
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index ac98e522a..279a2db46 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1741,6 +1741,23 @@ namespace xsimd
             return _mm_storeu_pd(mem, self);
        }
 
+        // store_stream
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_ps(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_si128((__m128i*)mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_pd(mem, self);
+        }
+
         // sub
         template <class A>
         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index 7fce2c314..c3820c40d 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -166,6 +166,23 @@ namespace xsimd
             }
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_stream_load_si128((__m128i*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
+        }
+
         // min
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
diff --git a/include/xsimd/memory/xsimd_alignment.hpp b/include/xsimd/memory/xsimd_alignment.hpp
index 2d59ac1fc..fd1918bea 100644
--- a/include/xsimd/memory/xsimd_alignment.hpp
+++ b/include/xsimd/memory/xsimd_alignment.hpp
@@ -33,6 +33,17 @@ namespace xsimd
     {
     };
 
+    /**
+     * @struct stream_mode
+     * @brief tag for non-temporal (streaming) loads and stores of aligned memory.
+     *
+     * Streaming accesses expect aligned pointers. When no architecture-specific
+     * implementation is available, they fall back to aligned semantics.
+     */
+    struct stream_mode
+    {
+    };
+
     /***********************
      * Allocator alignment *
      ***********************/
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index afaf2cdf1..056de35b6 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -12,6 +12,7 @@
 #ifndef XSIMD_API_HPP
 #define XSIMD_API_HPP
 
+#include <atomic>
 #include <complex>
 #include <cstddef>
 #include <limits>
@@ -1334,6 +1335,30 @@ namespace xsimd
         return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+    template <class To, class A = default_arch>
+    XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_stream(ptr);
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From>
     XSIMD_INLINE simd_return_type<xtl::xcomplex<From>, To, A> load_as(xtl::xcomplex<From> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1367,14 @@ namespace xsimd
         detail::static_check_supported_config<To, A>();
         return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<xtl::xcomplex<From>, To, A> load_as(xtl::xcomplex<From> const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
+    }
 #endif
 
     /**
@@ -1416,6 +1449,13 @@ namespace xsimd
         return load_as<From, A>(ptr, unaligned_mode {});
     }
 
+    template <class A = default_arch, class From>
+    XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, stream_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
     *
@@ -2339,12 +2379,40 @@ namespace xsimd
         kernel::store_complex_aligned<A>(dst, src, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        kernel::store_complex_stream<A>(dst, src, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From>
     XSIMD_INLINE void store_as(xtl::xcomplex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
     {
         store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(xtl::xcomplex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
+    }
 #endif
 
     /**
@@ -2413,6 +2481,22 @@ namespace xsimd
         store_as(mem, val, unaligned_mode {});
     }
 
+    template <class T, class A>
+    XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
+    {
+        store_as(mem, val, stream_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Issues a sequentially consistent memory fence.
+     */
+    XSIMD_INLINE void fence() noexcept
+    {
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+    }
+
     /**
      * @ingroup batch_data_transfer
     *
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index b3b704666..b00cadf69 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -142,6 +142,8 @@ namespace xsimd
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept;
@@ -151,6 +153,8 @@ namespace xsimd
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept;
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
 
         template <class U, class V>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, A> const& index) noexcept;
@@ -313,8 +317,10 @@ namespace xsimd
         // memory operators
         XSIMD_INLINE void store_aligned(bool* mem) const noexcept;
         XSIMD_INLINE void store_unaligned(bool* mem) const noexcept;
+        XSIMD_INLINE void store_stream(bool* mem) const noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_stream(bool const* mem) noexcept;
 
         XSIMD_INLINE bool get(std::size_t i) const noexcept;
 
@@ -404,9 +410,13 @@ namespace xsimd
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept;
         template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
+        template <class U>
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         XSIMD_INLINE real_batch real() const noexcept;
         XSIMD_INLINE real_batch imag() const noexcept;
@@ -616,6 +626,16 @@ namespace xsimd
         return store_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<T, A>::store(U* mem, stream_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        kernel::store_stream<A>(mem, *this, A {});
+    }
+
     /**
      * Loading from aligned memory. May involve a conversion if \c U is different
     * from \c T.
@@ -664,6 +684,16 @@ namespace xsimd
         return load_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        return kernel::load_stream<A>(mem, kernel::convert<T> {}, A {});
+    }
+
     /**
      * Create a new batch gathering elements starting at address \c src and
     * offset by each element in \c index.
@@ -987,6 +1017,12 @@ namespace xsimd
         store_aligned(mem);
     }
 
+    template <class T, class A>
+    XSIMD_INLINE void batch_bool<T, A>::store_stream(bool* mem) const noexcept
+    {
+        kernel::store_stream(*this, mem, A {});
+    }
+
     template <class T, class A>
     XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
     {
@@ -999,6 +1035,12 @@ namespace xsimd
         return kernel::load_unaligned(mem, batch_bool(), A {});
     }
 
+    template <class T, class A>
+    XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_stream(bool const* mem) noexcept
+    {
+        return kernel::load_stream(mem, batch_bool(), A {});
+    }
+
     /**
      * Extract a scalar mask representation from this @c batch_bool.
     *
@@ -1245,6 +1287,16 @@ namespace xsimd
         return load_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, stream_mode) noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        auto* ptr = reinterpret_cast<value_type const*>(mem);
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<value_type> {}, A {});
+    }
+
     template <class T, class A>
     template <class U>
     XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
@@ -1259,6 +1311,16 @@ namespace xsimd
         return store_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, stream_mode) const noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        auto* ptr = reinterpret_cast<value_type*>(mem);
+        return kernel::store_complex_stream(ptr, *this, A {});
+    }
+
     template <class T, class A>
     XSIMD_INLINE auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
     {
diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp
index 449c41e85..4258eea65 100644
--- a/test/test_load_store.cpp
+++ b/test/test_load_store.cpp
@@ -13,6 +13,7 @@
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
 #include <numeric>
+#include <type_traits>
 
 #include "test_utils.hpp"
 
@@ -173,6 +174,33 @@ struct load_store_test
     };
 #endif
 
+    template <class Ptr>
+    void stream_load_if_same(Ptr const* ptr, batch_type& b, array_type const& expected_values, const std::string& name,
+                             std::true_type) const
+    {
+        b = xsimd::load(ptr, xsimd::stream_mode());
+        INFO(name, " stream (load)");
+        CHECK_BATCH_EQ(b, expected_values);
+    }
+
+    template <class Ptr>
+    void stream_load_if_same(Ptr const*, batch_type&, array_type const&, const std::string&, std::false_type) const
+    {
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec& res, batch_type const& b, Vec const& reference, const std::string& name, std::true_type) const
+    {
+        xsimd::store(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store)");
+        CHECK_VECTOR_EQ(res, reference);
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec&, batch_type const&, Vec const&, const std::string&, std::false_type) const
+    {
+    }
+
     template <class V>
     void test_load_impl(const V& v, const std::string& name)
     {
@@ -186,6 +214,10 @@ struct load_store_test
         INFO(name, " aligned");
         CHECK_BATCH_EQ(b, expected);
 
+        b = batch_type::load(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::load)");
+        CHECK_BATCH_EQ(b, expected);
+
         b = xsimd::load_as<value_type>(v.data(), xsimd::unaligned_mode());
         INFO(name, " unaligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
@@ -193,6 +225,13 @@ struct load_store_test
         b = xsimd::load_as<value_type>(v.data(), xsimd::aligned_mode());
         INFO(name, " aligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
+
+        b = xsimd::load_as<value_type>(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (load_as)");
+        CHECK_BATCH_EQ(b, expected);
+
+        stream_load_if_same(v.data(), b, expected, name,
+                            std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
     }
 
     struct test_load_char
@@ -227,6 +266,17 @@ struct load_store_test
         xsimd::store_as(res.data(), b, xsimd::aligned_mode());
         INFO(name, " aligned (store_as)");
         CHECK_VECTOR_EQ(res, v);
+
+        b.store(res.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::store)");
+        CHECK_VECTOR_EQ(res, v);
+
+        xsimd::store_as(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store_as)");
+        CHECK_VECTOR_EQ(res, v);
+
+        stream_store_if_same(res, b, v, name,
+                             std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
     }
 
     template <class V>
@@ -301,4 +351,10 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
     SUBCASE("scatter") { Test.test_scatter(); }
 }
+
+TEST_CASE("[fence] sequential consistency")
+{
+    xsimd::fence();
+    CHECK(true);
+}
 
 #endif
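
Usage note (not part of the patch): a minimal sketch of how the new stream API fits together, for review purposes. The function name stream_copy and the copy task are illustrative only; the sketch assumes both buffers use xsimd::aligned_allocator, since stream_mode requires pointers aligned to the architecture's boundary, and it falls back to scalar code for any tail elements.

#include <cstddef>
#include <vector>

#include <xsimd/xsimd.hpp>

// Copy src to dst with non-temporal (stream) loads and stores, bypassing the
// cache for data that will not be reused soon, then issue xsimd::fence() so
// the streamed stores are ordered before subsequent memory operations.
void stream_copy(std::vector<float, xsimd::aligned_allocator<float>> const& src,
                 std::vector<float, xsimd::aligned_allocator<float>>& dst)
{
    using batch = xsimd::batch<float>;
    std::size_t i = 0;
    // Vectorized body: element 0 is aligned by the allocator and i advances in
    // whole batches, so every pointer passed with stream_mode stays aligned.
    for (; i + batch::size <= src.size(); i += batch::size)
    {
        batch b = batch::load(&src[i], xsimd::stream_mode());
        b.store(&dst[i], xsimd::stream_mode());
    }
    // Scalar tail for the remaining elements.
    for (; i < src.size(); ++i)
        dst[i] = src[i];
    // Order the non-temporal stores before anything that follows.
    xsimd::fence();
}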