36 changes: 36 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -298,6 +298,12 @@ namespace xsimd
return load_unaligned(mem, b, A {});
}

template <class A, class T>
XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
{
return load_aligned(mem, b, A {});
}

// load_aligned
namespace detail
{
@@ -348,6 +354,12 @@ namespace xsimd
return detail::load_unaligned<A>(mem, cvt, common {}, detail::conversion_type<A, T_in, T_out> {});
}

template <class A, class T_in, class T_out>
XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
{
return load_aligned<A>(mem, cvt, A {});
}

// rotate_right
template <size_t N, class A, class T>
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -589,6 +601,12 @@ namespace xsimd
mem[i] = bool(buffer[i]);
}

template <class A, class T>
XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
{
store(self, mem, A {});
}

// store_aligned
template <class A, class T_in, class T_out>
XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -607,6 +625,12 @@ namespace xsimd
return store_aligned<A>(mem, self, common {});
}

template <class A, class T_in, class T_out>
XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
{
store_aligned<A>(mem, self, A {});
}

// swizzle
template <class A, class T, class ITy, ITy... Vs>
XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -688,6 +712,12 @@ namespace xsimd
return detail::load_complex(hi, lo, A {});
}

template <class A, class T_out, class T_in>
XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
{
return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
}

// store_complex_aligned
template <class A, class T_out, class T_in>
XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -712,6 +742,12 @@
hi.store_unaligned(buffer + real_batch::size);
}

template <class A, class T_out, class T_in>
XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
{
store_complex_aligned<A>(dst, src, A {});
}

// transpose
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept
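Not part of the diff: a compact sketch of the dispatch idiom these fallbacks rely on. xsimd's architecture tags form an inheritance chain, so overload resolution picks an architecture-specific `load_stream` when one exists and otherwise lands on the `requires_arch<common>` overloads above, which forward to the aligned variants. The names below are simplified stand-ins, not the real xsimd types.

```cpp
#include <cstdio>

// Simplified stand-ins for xsimd's architecture tags and dispatch helper.
struct common { };
struct sse2 : common { }; // tags derive from more generic tiers

template <class A>
using requires_arch = A const&;

// Aligned load: available at the common tier.
int load_aligned(int const* p, requires_arch<common>) { return *p; }

// Generic streaming load: no native support, degrade to aligned semantics.
int load_stream(int const* p, requires_arch<common>) { return load_aligned(p, common {}); }

int main()
{
    int x = 7;
    // No sse2-specific load_stream overload exists here, so the call binds to
    // the requires_arch<common> fallback through the base-class conversion.
    std::printf("%d\n", load_stream(&x, sse2 {}));
}
```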
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1404,6 +1404,23 @@ namespace xsimd
return _mm256_storeu_pd(mem, self);
}

// store_stream
template <class A>
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
{
_mm256_stream_ps(mem, self);
}
template <class A>
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
{
_mm256_stream_pd(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
{
_mm256_stream_si256((__m256i*)mem, self);
}

// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
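For readers unfamiliar with the intrinsics, here is a standalone sketch of what these overloads emit (not taken from the PR; assumes an AVX-capable target, e.g. compiled with `-mavx`). Non-temporal stores write around the cache, require a 32-byte-aligned destination, and are weakly ordered, hence the fence before the data is reused.

```cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(32) float dst[8] = { 0 }; // vmovntps requires 32-byte alignment
    __m256 v = _mm256_set1_ps(1.5f);
    _mm256_stream_ps(dst, v);         // non-temporal store, bypasses the cache
    _mm_sfence();                     // order the streaming store before the load below
    std::printf("%g\n", dst[0]);      // prints 1.5
}
```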
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -116,6 +116,23 @@ namespace xsimd
}
}

// load_stream
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
{
return _mm256_stream_load_si256((__m256i const*)mem);
}
template <class A>
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
{
return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
}
template <class A>
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
{
return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
}

// bitwise_and
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
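A standalone sketch of the cast pattern above (not part of the diff; assumes an AVX2 build, e.g. `-mavx2`): `vmovntdqa` exists only as an integer load, so the float and double overloads fetch the raw bits and reinterpret them at zero cost. As far as I know, the non-temporal hint only pays off on write-combining memory; on ordinary write-back memory the instruction behaves like a regular aligned load, which keeps the fallback semantics intact.

```cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(32) float src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    // Streaming load of the raw bits, then a free bit-for-bit reinterpretation.
    __m256i bits = _mm256_stream_load_si256(reinterpret_cast<__m256i const*>(src));
    __m256 v = _mm256_castsi256_ps(bits);
    alignas(32) float out[8];
    _mm256_store_ps(out, v);
    std::printf("%g\n", out[7]); // prints 7
}
```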
34 changes: 34 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -1391,6 +1391,23 @@ namespace xsimd
return _mm512_loadu_pd(mem);
}

// load_stream
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
{
return _mm512_stream_load_si512((__m512i*)mem);
}
template <class A>
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
}
template <class A>
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
}

// lt
template <class A>
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2171,6 +2188,23 @@ namespace xsimd
return _mm512_storeu_pd(mem, self);
}

// store_stream
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
_mm512_stream_si512((__m512i*)mem, self);
}
template <class A>
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
_mm512_stream_ps(mem, self);
}
template <class A>
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
_mm512_stream_pd(mem, self);
}

// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
@@ -1741,6 +1741,23 @@ namespace xsimd
return _mm_storeu_pd(mem, self);
}

// store_stream
template <class A>
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
{
_mm_stream_ps(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
{
_mm_stream_si128((__m128i*)mem, self);
}
template <class A>
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
{
_mm_stream_pd(mem, self);
}

// sub
template <class A>
XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_sse4_1.hpp
@@ -166,6 +166,23 @@ namespace xsimd
}
}

// load_stream
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
{
return _mm_stream_load_si128((__m128i*)mem);
}
template <class A>
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
{
return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
}
template <class A>
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
{
return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
}

// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
11 changes: 11 additions & 0 deletions include/xsimd/memory/xsimd_alignment.hpp
@@ -33,6 +33,17 @@ namespace xsimd
{
};

/**
* @struct stream_mode
* @brief tag for aligned, non-temporal (streaming) loads and stores.
*
* Streaming accesses require aligned pointers. When no architecture-specific
* implementation is available, they fall back to plain aligned semantics.
*/
struct stream_mode
{
};
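Below, a hedged usage sketch of the new tag through the public API (assumptions: `xsimd.hpp` is on the include path and the buffer length is a multiple of the batch size; `xsimd::aligned_allocator` provides the required alignment):

```cpp
#include <xsimd/xsimd.hpp>
#include <cstdio>
#include <vector>

int main()
{
    using batch = xsimd::batch<float>;
    std::vector<float, xsimd::aligned_allocator<float>> buf(batch::size);

    batch v(2.5f);
    xsimd::store(buf.data(), v, xsimd::stream_mode {}); // non-temporal where supported
    xsimd::fence();                                     // also added by this PR
    batch w = xsimd::load(buf.data(), xsimd::stream_mode {});
    std::printf("%g\n", w.get(0)); // prints 2.5
}
```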

/***********************
* Allocator alignment *
***********************/
84 changes: 84 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
@@ -12,6 +12,7 @@
#ifndef XSIMD_API_HPP
#define XSIMD_API_HPP

#include <atomic>
#include <complex>
#include <cstddef>
#include <limits>
@@ -1334,6 +1335,30 @@ namespace xsimd
return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
}

template <class To, class A = default_arch, class From>
XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
{
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
detail::static_check_supported_config<From, A>();
detail::static_check_supported_config<To, A>();
return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
}

template <class To, class A = default_arch>
XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
{
detail::static_check_supported_config<To, A>();
return simd_return_type<bool, To, A>::load_stream(ptr);
}

template <class To, class A = default_arch, class From>
XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
{
detail::static_check_supported_config<To, A>();
using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
}

#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1367,14 @@ namespace xsimd
detail::static_check_supported_config<From, A>();
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
}

template <class To, class A = default_arch, class From, bool i3ec>
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
{
detail::static_check_supported_config<To, A>();
detail::static_check_supported_config<From, A>();
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
}
#endif

/**
@@ -1416,6 +1449,13 @@ namespace xsimd
return load_as<From, A>(ptr, unaligned_mode {});
}

template <class A = default_arch, class From>
XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
{
detail::static_check_supported_config<From, A>();
return load_as<From, A>(ptr, stream_mode {});
}

/**
* @ingroup batch_data_transfer
*
@@ -2339,6 +2379,40 @@ namespace xsimd
kernel::store_complex_aligned<A>(dst, src, A {});
}

template <class To, class A = default_arch, class From>
XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
{
detail::static_check_supported_config<From, A>();
kernel::store_stream<A>(dst, src, A {});
}

template <class A = default_arch, class From>
XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
{
detail::static_check_supported_config<From, A>();
kernel::store_stream<A>(src, dst, A {});
}

template <class To, class A = default_arch, class From>
XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
{
detail::static_check_supported_config<std::complex<From>, A>();
kernel::store_complex_stream<A>(dst, src, A {});
}

#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
{
store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
}

template <class To, class A = default_arch, class From, bool i3ec>
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
{
detail::static_check_supported_config<std::complex<From>, A>();
store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
}
#endif

/**
@@ -2413,6 +2481,22 @@ namespace xsimd
store_as<T, A>(mem, val, unaligned_mode {});
}

template <class A, class T>
XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
{
store_as<T, A>(mem, val, stream_mode {});
}

/**
* @ingroup batch_data_transfer
*
* Issues a sequentially consistent memory fence.
*/
XSIMD_INLINE void fence() noexcept
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}
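
A minimal producer-side sketch of why fence() pairs with the streaming stores above (assumptions: dst is aligned for the current architecture and n is a multiple of the batch size). Non-temporal stores are weakly ordered, so the fence is issued once after the loop, before the buffer is handed to a consumer.

```cpp
#include <xsimd/xsimd.hpp>
#include <cstddef>

void fill_stream(float* dst, std::size_t n, float value)
{
    using batch = xsimd::batch<float>;
    batch v(value);
    for (std::size_t i = 0; i < n; i += batch::size)
        xsimd::store(dst + i, v, xsimd::stream_mode {});
    xsimd::fence(); // make the weakly ordered stores visible before any handoff
}
```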

/**
* @ingroup batch_data_transfer
*