Commit 724eb36

Simplify load implementation from #1172
- Move the part of the implementation that lived in sse4_1 but for which sse2 is a better home
- Avoid the auxiliary helper function
1 parent c377859 commit 724eb36
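For context, every kernel touched here relies on the same trick: a bool stored in memory is the byte 0 or 1, so subtracting it from zero per lane (0 - 1 wraps to 0xFF, 0 - 0 stays 0x00) turns a packed array of bools into a full lane mask. A minimal standalone sketch of that idea with raw SSE2 intrinsics (illustrative only, not code from this commit):

    #include <emmintrin.h> // SSE2
    #include <cstdio>

    int main()
    {
        // 16 flags stored as the bytes 0/1, the same layout as a bool array.
        unsigned char flags[16] = { 1, 0, 1, 1, 0, 0, 1, 0,
                                    1, 1, 0, 1, 0, 1, 0, 0 };

        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(flags));
        // 0 - 1 == 0xFF and 0 - 0 == 0x00 in every byte lane: a ready-to-use mask.
        __m128i mask = _mm_sub_epi8(_mm_setzero_si128(), b);

        unsigned char out[16];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), mask);
        for (int i = 0; i < 16; ++i)
            std::printf("%02x ", static_cast<unsigned>(out[i])); // ff 00 ff ff ...
        std::printf("\n");
    }

The wider-element overloads in the diff first widen the bytes with _mm_cvtepu8_epi16/32/64 (or the _mm256_ equivalents) and then do the same subtraction at the wider lane size.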

3 files changed: +89 / -89 lines

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 36 additions & 44 deletions
@@ -572,64 +572,56 @@ namespace xsimd
         }
 
         // load_unaligned<batch_bool>
-        namespace detail
+
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return { _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto bpack = _mm_loadu_si128((__m128i const*)mem);
+                return { _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)) };
+            }
+            // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct load.
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
             {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto bpack = _mm_loadu_si128((__m128i const*)mem);
-                    return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
-                }
-                // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
 #if defined(__x86_64__)
-                    uint64_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    auto val = _mm_cvtsi64_si128(tmp);
+                uint64_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                auto val = _mm_cvtsi64_si128(tmp);
 #else
-                    __m128i val;
-                    memcpy(&val, mem, sizeof(uint64_t));
+                __m128i val;
+                memcpy(&val, mem, sizeof(uint64_t));
 #endif
-                    return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    uint32_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                    return __m256i {};
-                }
+                return { _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                uint32_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                return { _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))) };
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
             }
-        }
-
-        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
-        {
-            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
         }
 
         template <class A>
-        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2> r) noexcept
         {
-            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+            return { load_unaligned(mem, batch_bool<uint32_t, A>{}, r) };
         }
 
         template <class A>
-        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2> r) noexcept
         {
-            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
+            return { load_unaligned(mem, batch_bool<uint64_t, A>{}, r) };
         }
 
         // mask

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 20 additions & 0 deletions
@@ -1043,6 +1043,26 @@ namespace xsimd
             return _mm_loadu_pd(mem);
         }
 
+        // load batch_bool
+
+        template <class A>
+        XSIMD_INLINE batch_bool<char, A> load_unaligned(bool const* mem, batch_bool<char, A>, requires_arch<sse2>) noexcept
+        {
+            return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<unsigned char, A> load_unaligned(bool const* mem, batch_bool<unsigned char, A>, requires_arch<sse2> r) noexcept
+        {
+            return { load_unaligned(mem, batch_bool<char, A> {}, r).data };
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<signed char, A> load_unaligned(bool const* mem, batch_bool<signed char, A>, requires_arch<sse2> r) noexcept
+        {
+            return { load_unaligned(mem, batch_bool<char, A> {}, r).data };
+        }
+
         // load_complex
         namespace detail
         {
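For callers, nothing changes: these kernels are still reached through xsimd's public batch_bool<T>::load_unaligned(bool const*) entry point, which dispatches per architecture. A hedged usage sketch (illustrative only; assumes that entry point and that the selected architecture provides the kernel):

    #include <cstdint>
    #include <cstdio>
    #include "xsimd/xsimd.hpp"

    int main()
    {
        using batch_t = xsimd::batch<std::uint8_t>;
        using mask_t = xsimd::batch_bool<std::uint8_t>;
        constexpr std::size_t N = batch_t::size;

        bool flags[N];
        std::uint8_t a[N], b[N];
        for (std::size_t i = 0; i < N; ++i)
        {
            flags[i] = (i % 2 == 0);
            a[i] = 1;
            b[i] = 2;
        }

        // Dispatches to a kernel such as load_unaligned(bool const*, batch_bool<char, A>, requires_arch<sse2>).
        mask_t m = mask_t::load_unaligned(flags);
        batch_t r = xsimd::select(m, batch_t::load_unaligned(a), batch_t::load_unaligned(b));

        std::uint8_t out[N];
        r.store_unaligned(out);
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 1 2 1 2
    }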

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 33 additions & 45 deletions
@@ -123,65 +123,53 @@ namespace xsimd
         }
 
         // load_unaligned<batch_bool>
-        namespace detail
+
+        template <class T, class A, class = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) > 1), void>::type>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
+            // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct load.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
             {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
-                }
-                // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
 #if defined(__x86_64__)
-                    uint64_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    auto val = _mm_cvtsi64_si128(tmp);
+                uint64_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                auto val = _mm_cvtsi64_si128(tmp);
 #else
-                    __m128i val;
-                    memcpy(&val, mem, sizeof(uint64_t));
+                __m128i val;
+                memcpy(&val, mem, sizeof(uint64_t));
 #endif
-                    return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
-                    uint32_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    uint16_t tmp;
-                    memcpy(&tmp, mem, sizeof(tmp));
-                    return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp)));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                    return __m128i {};
-                }
+                return { _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val)) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                uint32_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                return { _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp))) };
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                uint16_t tmp;
+                memcpy(&tmp, mem, sizeof(tmp));
+                return { _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp))) };
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return __m128i {};
             }
-        }
-
-        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
-        {
-            return batch_bool<T, A>(detail::load_bool_sse4_1<T>(mem));
         }
 
         template <class A>
-        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1>) noexcept
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1> r) noexcept
         {
-            return batch_bool<float, A>(_mm_castsi128_ps(detail::load_bool_sse4_1<float>(mem)));
+            return { _mm_castsi128_ps(load_unaligned(mem, batch_bool<uint32_t, A> {}, r)) };
        }
 
         template <class A>
-        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1>) noexcept
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1> r) noexcept
         {
-            return batch_bool<double, A>(_mm_castsi128_pd(detail::load_bool_sse4_1<double>(mem)));
+            return { _mm_castsi128_pd(load_unaligned(mem, batch_bool<uint64_t, A> {}, r)) };
         }
 
         // max
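The sse4_1 overload is constrained to sizeof(T) > 1 because the one-byte case now lives in xsimd_sse2.hpp above and is picked up through the architecture hierarchy, while the wider cases need the SSE4.1 _mm_cvtepu8_epi16/32/64 widening before the subtraction. A standalone sketch of the sizeof(T) == 4 branch, mirroring the memcpy workaround used in the diff (illustrative only; compile with SSE4.1 enabled):

    #include <smmintrin.h> // SSE4.1
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main()
    {
        // Four flags as bytes; memcpy into a scalar sidesteps the unaligned-load
        // intrinsics that are missing or buggy on GCC < 12.
        unsigned char flags[4] = { 1, 0, 0, 1 };
        std::uint32_t tmp;
        std::memcpy(&tmp, flags, sizeof(tmp));

        // Widen the four bytes to four 32-bit lanes, then 0 - x yields 0 or ~0 per lane.
        __m128i widened = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(static_cast<int>(tmp)));
        __m128i mask = _mm_sub_epi32(_mm_setzero_si128(), widened);

        unsigned out[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), mask);
        std::printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]); // ffffffff 00000000 00000000 ffffffff
    }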
