Commit 005f629

Add {load,store}_unaligned for batch_bool on x86 (#1172)
* Add `{load,store}_unaligned` for `batch_bool`

These work around GCC not being able to optimize the baseline implementations into SIMD operations, and compilers in general not being able to prove that the vector representation is a bitmask of all zeros or all ones, i.e. integer 0 or -1.

Also adds more robust tests for bool load/store that check for bitwise correctness (each stored value is exactly 0 or 1).
1 parent d6150b8 commit 005f629
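For orientation, here is a minimal sketch of how these kernels are reached through the public `batch_bool` API and what "bitwise correctness" means here: a round trip through bool storage where every stored byte must come back as exactly 0 or 1. The element type and test values are illustrative, not taken from the commit's actual test suite.

    #include <xsimd/xsimd.hpp>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        using mask_t = xsimd::batch_bool<int32_t>;
        constexpr std::size_t n = mask_t::size;

        bool in[n];
        for (std::size_t i = 0; i < n; ++i)
            in[i] = (i % 3 == 0);

        // bool bytes (0/1) -> lane masks (0/-1): backed by the new load_unaligned kernels
        mask_t m = mask_t::load_unaligned(in);

        // lane masks (0/-1) -> bool bytes (0/1): backed by the new store kernels
        bool out[n];
        m.store_unaligned(out);

        // bitwise correctness: each stored byte is exactly 0 or 1
        for (std::size_t i = 0; i < n; ++i)
        {
            unsigned char byte;
            std::memcpy(&byte, &out[i], 1);
            assert(byte == static_cast<unsigned char>(in[i]));
        }
        return 0;
    }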

5 files changed (+314, -27 lines)


include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 122 additions & 0 deletions
@@ -570,6 +570,68 @@ namespace xsimd
             batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
             return { real, imag };
         }
+
+        // load_unaligned<batch_bool>
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto bpack = _mm_loadu_si128((__m128i const*)mem);
+                    return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
+                }
+                // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+#if defined(__x86_64__)
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto val = _mm_cvtsi64_si128(tmp);
+#else
+                    __m128i val;
+                    memcpy(&val, mem, sizeof(uint64_t));
+#endif
+                    return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m256i {};
+                }
+            }
+        }
+
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
+        }
+
         // mask
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
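Lane for lane, the load path above is a single identity: a bool byte holding 0 or 1, zero-extended to the element width, becomes a full mask by subtraction from zero. A scalar model of one 32-bit lane, for illustration only (the widening corresponds to `_mm256_cvtepu8_epi32`, the subtraction to `_mm256_sub_epi32`):

    #include <cstdint>

    // bool byte 0/1 -> 32-bit mask lane 0/-1 (all bits set)
    inline int32_t bool_byte_to_mask_lane(uint8_t b)
    {
        return 0 - static_cast<int32_t>(b); // 0 -> 0, 1 -> -1
    }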
@@ -923,6 +985,66 @@ namespace xsimd
             return _mm256_or_si256(y, w);
         }

+        // store<batch_bool>
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept
+            {
+                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                    return;
+                }
+
+                auto b_hi = _mm256_extractf128_si256(b, 1);
+                auto b_lo = _mm256_castsi256_si128(b);
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16));
+#if defined(__x86_64__)
+                    auto val_lo = _mm_cvtsi128_si64(val);
+                    memcpy(mem, &val_lo, sizeof(val_lo));
+#else
+                    memcpy(mem, &val, sizeof(uint64_t));
+#endif
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    const auto bmask = _mm_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, -1, -1, 8, 0);
+                    auto pack = _mm_unpacklo_epi16(_mm_shuffle_epi8(b_lo, bmask), _mm_shuffle_epi8(b_hi, bmask));
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
+        {
+            detail::store_bool_avx2(detail::avx_to_i(b), mem, T {});
+        }
+
         // ssub
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
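The store path inverts the same identity before writing: subtracting each 0/-1 mask lane from zero yields the 0/1 byte that bool storage expects, which is what the "negate mask to convert to 0 or 1" comments refer to. A scalar model of one stored byte, for illustration only:

    #include <cstdint>

    // mask lane 0/-1 -> bool byte 0/1, written straight into bool memory
    inline uint8_t mask_lane_to_bool_byte(int8_t lane)
    {
        return static_cast<uint8_t>(0 - lane); // 0 -> 0, -1 -> 1
    }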

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 54 additions & 0 deletions
@@ -1697,6 +1697,60 @@ namespace xsimd
             }
         }

+        // store<batch_bool>
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept
+            {
+                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
+#if defined(__x86_64__)
+                    auto val_lo = _mm_cvtsi128_si64(val);
+                    memcpy(mem, &val_lo, sizeof(val_lo));
+#else
+                    memcpy(mem, &val, sizeof(uint64_t));
+#endif
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    auto pack_16 = _mm_packs_epi32(b, b);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    auto pack_32 = _mm_packs_epi32(b, b);
+                    auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
+                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
+        {
+            detail::store_bool_sse2(detail::sse_to_i(b), mem, T {});
+        }
+
         // store_aligned
         template <class A>
         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
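The SSE2 store narrows wider masks to bytes with saturating packs before the final negation. This is safe because signed saturation leaves both mask values unchanged; a scalar model of one `_mm_packs_epi16` lane, for illustration only:

    #include <cstdint>

    // Signed saturation maps 0 -> 0 and -1 -> -1, so narrowing a 0/-1 mask
    // preserves the mask pattern.
    inline int8_t packs_epi16_lane(int16_t x)
    {
        if (x > INT8_MAX) return INT8_MAX;
        if (x < INT8_MIN) return INT8_MIN;
        return static_cast<int8_t>(x);
    }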

include/xsimd/arch/xsimd_sse3.hpp

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@ namespace xsimd
             __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
             return _mm_cvtss_f32(tmp2);
         }
-
     }

 }

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 62 additions & 0 deletions
@@ -122,6 +122,68 @@ namespace xsimd
             }
         }

+        // load_unaligned<batch_bool>
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
+                }
+                // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+#if defined(__x86_64__)
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto val = _mm_cvtsi64_si128(tmp);
+#else
+                    __m128i val;
+                    memcpy(&val, mem, sizeof(uint64_t));
+#endif
+                    return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    uint16_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp)));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m128i {};
+                }
+            }
+        }
+
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<T, A>(detail::load_bool_sse4_1<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<float, A>(_mm_castsi128_ps(detail::load_bool_sse4_1<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<double, A>(_mm_castsi128_pd(detail::load_bool_sse4_1<double>(mem)));
+        }
+
         // max
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
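The memcpy idiom called out in the comments above is the standard portable way to express a small unaligned load: a fixed-size memcpy into a local has no alignment requirement, and optimizing compilers lower it to a single load instruction. A minimal sketch of the pattern outside any intrinsics, for illustration only:

    #include <cstdint>
    #include <cstring>

    // Portable unaligned 32-bit load; compiles to one mov on x86.
    inline uint32_t load_u32_unaligned(const unsigned char* p)
    {
        uint32_t v;
        std::memcpy(&v, p, sizeof(v));
        return v;
    }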
