@@ -572,64 +572,56 @@ namespace xsimd
         }
 
         // load_unaligned<batch_bool>
-        namespace detail
+
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return { _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto bpack = _mm_loadu_si128((__m128i const*)mem);
+                return { _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)) };
+            }
+            // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct load.
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
             {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto bpack = _mm_loadu_si128((__m128i const*)mem);
-                    return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
-                }
-                // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
 #if defined(__x86_64__)
-                    uint64_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    auto val = _mm_cvtsi64_si128(tmp);
+                uint64_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                auto val = _mm_cvtsi64_si128(tmp);
#else
-                    __m128i val;
-                    memcpy(&val, mem, sizeof(uint64_t));
+                __m128i val;
+                memcpy(&val, mem, sizeof(uint64_t));
#endif
-                    return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    uint32_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                    return __m256i {};
-                }
+                return { _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                uint32_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                return { _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))) };
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
             }
-        }
-
-        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
-        {
-            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
         }
 
         template <class A>
-        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2> r) noexcept
         {
-            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+            return { load_unaligned(mem, batch_bool<uint32_t, A>{}, r) };
         }
 
         template <class A>
-        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2> r) noexcept
         {
-            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
+            return { load_unaligned(mem, batch_bool<uint64_t, A>{}, r) };
         }
 
         // mask
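For reference, a minimal usage sketch of the path this hunk touches (not part of the diff). Assuming a build with AVX2 enabled, batch_bool<T, A>::load_unaligned(bool const*) should end up in the kernel above, which widens each byte-sized bool to a full lane and negates it (0 - 1 yields an all-ones lane), i.e. the mask representation batch_bool stores. The members and free functions used below (load_unaligned, store_unaligned, any, all) follow the public xsimd API as I understand it; the lane count and sample values are illustrative only.

#include <cstdint>
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    // 8 lanes of 32-bit masks on AVX2 (requires e.g. -mavx2).
    using mask_t = xsimd::batch_bool<int32_t, xsimd::avx2>;
    bool flags[mask_t::size] = { true, false, true, true, false, false, true, false };

    // Unaligned load of 8 bools; each lane becomes 0x00000000 or 0xFFFFFFFF.
    mask_t m = mask_t::load_unaligned(flags);

    std::cout << "any: " << xsimd::any(m) << ", all: " << xsimd::all(m) << '\n';

    // Round-trip back to a plain bool array.
    bool out[mask_t::size];
    m.store_unaligned(out);
    return 0;
}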