Skip to content

Commit 9afc465

Browse files
Extend #1172 approach to avx512
1 parent b56e1bf commit 9afc465

File tree

1 file changed

+52
-6
lines changed

1 file changed

+52
-6
lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 52 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -316,18 +316,64 @@ namespace xsimd
316316
}
317317

318318
// load
// load_unaligned: builds a batch_bool<T, A> from the bool array at `mem` using
// unaligned-load intrinsics.
// Diff view: a line number followed by `-` precedes a removed line, one followed
// by `+` precedes an added line, and a doubled number precedes an unchanged line.
// The change swaps the `batch_bool<T, A>::size == 64` constraint for
// `std::is_integral<T>` and picks the load width from batch_bool<T, A>::size
// inside the body instead.
319-
template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
319+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
320320
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321321
{
322-
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
323-
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
// 64 lanes: load 64 bytes and compare each byte (unsigned, greater-than) against
// zero; the comparison result is returned as the mask.
// NOTE(review): XSIMD_IF_CONSTEXPR is defined elsewhere; presumably it expands to
// `if constexpr` when available -- confirm that every branch still compiles for
// every T if it can fall back to a plain `if`.
322+
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64)
323+
{
324+
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
325+
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
326+
}
// 32 lanes: load 32 bytes, zero-extend each byte to 16 bits, then compute 0 - x
// per 16-bit lane.
// NOTE(review): this and the following branches brace-initialize the result from
// a __m512i, whereas the 64-lane branch returns a mask -- confirm batch_bool<T, A>
// accepts both for this architecture.
327+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 32)
328+
{
329+
__m256i bpack = _mm256_loadu_si256((__m256i const*)mem);
330+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi16(bpack)) };
331+
}
// 16 lanes: load 16 bytes and zero-extend each byte to 32 bits.
// NOTE(review): the subtraction is still _mm512_sub_epi16, so only the low 16 bits
// of each 32-bit element are negated (a byte of 1 gives 0x0000FFFF) -- confirm
// whether _mm512_sub_epi32 was intended.
332+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 16)
333+
{
334+
__m128i bpack = _mm_loadu_si128((__m128i const*)mem);
335+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi32(bpack)) };
336+
}
// 8 lanes: _mm_loadl_epi64 reads 8 bytes into the low half of the register; each
// byte is then zero-extended to 64 bits.
// NOTE(review): same 16-bit subtraction on wider elements as above -- confirm
// whether _mm512_sub_epi64 was intended.
337+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 8)
338+
{
339+
__m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
340+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi64(bpack)) };
341+
}
// Any other lane count is unhandled: assert, then return an empty-brace-initialized
// result for builds where the assertion is compiled out.
342+
else
343+
{
344+
assert(false && "unexpected batch size");
345+
return {};
346+
}
324347
}
325348

// load_aligned: builds a batch_bool<T, A> from the bool array at `mem` using the
// aligned-load intrinsics (except for the 8-lane branch, see below).
// Diff view: a line number followed by `-` precedes a removed line, one followed
// by `+` precedes an added line, and a doubled number precedes an unchanged line.
// The change swaps the `batch_bool<T, A>::size == 64` constraint for
// `std::is_integral<T>` and picks the load width from batch_bool<T, A>::size
// inside the body instead.
326-
template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
349+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
327350
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
328351
{
329-
__m512i bool_val = _mm512_load_si512((__m512i const*)mem);
330-
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
// 64 lanes: aligned 64-byte load, then compare each byte (unsigned, greater-than)
// against zero; the comparison result is returned as the mask.
// NOTE(review): XSIMD_IF_CONSTEXPR is defined elsewhere; presumably it expands to
// `if constexpr` when available -- confirm that every branch still compiles for
// every T if it can fall back to a plain `if`.
352+
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64)
353+
{
354+
__m512i bool_val = _mm512_load_si512((__m512i const*)mem);
355+
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
356+
}
// 32 lanes: aligned 32-byte load, zero-extend each byte to 16 bits, then compute
// 0 - x per 16-bit lane.
// NOTE(review): this and the following branches brace-initialize the result from
// a __m512i, whereas the 64-lane branch returns a mask -- confirm batch_bool<T, A>
// accepts both for this architecture.
357+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 32)
358+
{
359+
__m256i bpack = _mm256_load_si256((__m256i const*)mem);
360+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi16(bpack)) };
361+
}
// 16 lanes: aligned 16-byte load, zero-extend each byte to 32 bits.
// NOTE(review): the subtraction is still _mm512_sub_epi16, so only the low 16 bits
// of each 32-bit element are negated (a byte of 1 gives 0x0000FFFF) -- confirm
// whether _mm512_sub_epi32 was intended.
362+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 16)
363+
{
364+
__m128i bpack = _mm_load_si128((__m128i const*)mem);
365+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi32(bpack)) };
366+
}
// 8 lanes: uses _mm_loadl_epi64, the same 8-byte load as the unaligned overload;
// each byte is then zero-extended to 64 bits.
// NOTE(review): same 16-bit subtraction on wider elements as above -- confirm
// whether _mm512_sub_epi64 was intended.
367+
else XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 8)
368+
{
369+
__m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
370+
return { _mm512_sub_epi16(_mm512_set1_epi8(0), _mm512_cvtepu8_epi64(bpack)) };
371+
}
// Any other lane count is unhandled: assert, then return an empty-brace-initialized
// result for builds where the assertion is compiled out.
372+
else
373+
{
374+
assert(false && "unexpected batch size");
375+
return {};
376+
}
331377
}
332378

333379
// max

0 commit comments

Comments
 (0)