@@ -570,6 +570,68 @@ namespace xsimd
    batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
    return { real, imag };
}
+
+ // load_unaligned<batch_bool>
+ namespace detail
+ {
+     template <class T>
+     XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+     {
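+         // bool values in memory are 0 or 1; subtracting them from zero yields 0 or -1,
+         // i.e. the all-zero / all-ones lane masks batch_bool uses on AVX2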
+         XSIMD_IF_CONSTEXPR (sizeof(T) == 1)
+         {
+             return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
+         }
+         else XSIMD_IF_CONSTEXPR (sizeof(T) == 2)
+         {
+             auto bpack = _mm_loadu_si128((__m128i const*)mem);
+             return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
+         }
+         // GCC < 12 has missing or buggy unaligned load intrinsics; use memcpy to work around this.
+         // GCC/Clang/MSVC will turn it into the correct load.
+         else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
+         {
+ #if defined(__x86_64__)
+             uint64_t tmp;
+             memcpy(&tmp, mem, sizeof(tmp));
+             auto val = _mm_cvtsi64_si128(tmp);
+ #else
+             __m128i val;
+             memcpy(&val, mem, sizeof(uint64_t));
+ #endif
+             return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
+         }
+         else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
+         {
+             uint32_t tmp;
+             memcpy(&tmp, mem, sizeof(tmp));
+             return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
+         }
+         else
+         {
+             assert(false && "unsupported arch/op combination");
+             return __m256i{};
+         }
+     }
+ }
+
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
+ {
+     return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+ {
+     return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+ {
+     return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
+ }
+
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
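A minimal usage sketch for the load path added in the hunk above, assuming the public
batch_bool<T, A>::load_unaligned(bool const*) front end dispatches to this AVX2 kernel
(the array contents are illustrative):

    #include <xsimd/xsimd.hpp>

    int main()
    {
        // one bool per lane of an 8 x float AVX2 batch
        bool flags[8] = { true, false, true, true, false, false, true, false };
        // loads 8 bytes and widens each 0/1 into a full 0/-1 lane mask
        auto mask = xsimd::batch_bool<float, xsimd::avx2>::load_unaligned(flags);
        return xsimd::any(mask) ? 0 : 1;
    }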
@@ -923,6 +985,66 @@ namespace xsimd
    return _mm256_or_si256(y, w);
}

+ // store<batch_bool>
+ namespace detail
+ {
+     template <class T>
+     XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept
+     {
+         // GCC < 12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+         // GCC/Clang/MSVC will turn it into the correct store.
+         XSIMD_IF_CONSTEXPR (sizeof(T) == 1)
+         {
+             // negate mask to convert to 0 or 1
+             auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
+             memcpy(mem, &val, sizeof(val));
+             return;
+         }
+
+         auto b_hi = _mm256_extractf128_si256(b, 1);
+         auto b_lo = _mm256_castsi256_si128(b);
+         XSIMD_IF_CONSTEXPR (sizeof(T) == 2)
+         {
+             auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
+             memcpy(mem, &val, sizeof(val));
+         }
+         else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
+         {
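+             // pack the 32-bit masks down to 16-bit, then to 8-bit, before negating to 0/1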
+             auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
+             auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16));
+ #if defined(__x86_64__)
+             auto val_lo = _mm_cvtsi128_si64(val);
+             memcpy(mem, &val_lo, sizeof(val_lo));
+ #else
+             memcpy(mem, &val, sizeof(uint64_t));
+ #endif
+         }
+         else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
+         {
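+             // the shuffle keeps only bytes 0 and 8 (the low byte of each 64-bit mask lane);
+             // interleaving the two halves then gathers the four lane masks into the low 32 bits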
+             const auto bmask = _mm_set_epi8(
+                 -1, -1, -1, -1, -1, -1, -1, -1,
+                 -1, -1, -1, -1, -1, -1, 8, 0);
+             auto pack = _mm_unpacklo_epi16(_mm_shuffle_epi8(b_lo, bmask), _mm_shuffle_epi8(b_hi, bmask));
+             uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack));
+             memcpy(mem, &val, sizeof(val));
+         }
+         else
+         {
+             assert(false && "unsupported arch/op combination");
+         }
+     }
+
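+     // reinterpret float/double registers as integer registers so a single
+     // store_bool_avx2 implementation covers every element type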
+     XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
+     XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
+     XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
+ }
+
+ template <class T, class A>
+ XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
+ {
+     detail::store_bool_avx2(detail::avx_to_i(b), mem, T{});
+ }
+
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
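A round-trip sketch for the store path added in the second hunk, assuming the public
batch_bool<T, A>::store_unaligned(bool*) front end forwards to this kernel (buffer names
are illustrative):

    #include <xsimd/xsimd.hpp>

    int main()
    {
        bool in[8] = { true, true, false, false, true, false, false, true };
        auto mask = xsimd::batch_bool<int32_t, xsimd::avx2>::load_unaligned(in);

        // packs the 32-bit lane masks back down to one 0/1 byte per lane
        bool out[8];
        mask.store_unaligned(out);

        for (int i = 0; i < 8; ++i)
            if (in[i] != out[i])
                return 1;
        return 0;
    }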