@@ -316,18 +316,66 @@ namespace xsimd
        }

        // load
-        template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
        {
-            __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
-            return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            using mask_type = typename batch_bool<T, A>::register_type;
+            XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64)
+            {
+                __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32)
+            {
+                __m256i bpack = _mm256_loadu_si256((__m256i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16)
+            {
+                __m128i bpack = _mm_loadu_si128((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8)
+            {
+                __m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512());
+            }
+            else
+            {
+                assert(false && "unexpected batch size");
+                return {};
+            }
        }

-        template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
        {
-            __m512i bool_val = _mm512_load_si512((__m512i const*)mem);
-            return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            using mask_type = typename batch_bool<T, A>::register_type;
+            XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64)
+            {
+                __m512i bool_val = _mm512_load_si512((__m512i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32)
+            {
+                __m256i bpack = _mm256_load_si256((__m256i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16)
+            {
+                __m128i bpack = _mm_load_si128((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8)
+            {
+                __m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512());
+            }
+            else
+            {
+                assert(false && "unexpected batch size");
+                return {};
+            }
        }

        // max
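
A minimal usage sketch (not part of the patch), illustrating what relaxing the constraint from size == 64 to std::is_integral<T> buys: bool masks can now be loaded for the narrower AVX-512 lane counts as well. It assumes an AVX512BW-enabled build and the public batch_bool load/store helpers taking bool pointers found in current xsimd releases; int16_t gives 32 lanes, so this exercises the _mm512_cvtepu8_epi16 branch above.

    #include <xsimd/xsimd.hpp>

    #include <cassert>

    int main()
    {
        // 32 lanes of int16_t under AVX-512, i.e. the size == 32 dispatch path.
        using mask_t = xsimd::batch_bool<int16_t, xsimd::avx512bw>;

        bool in[mask_t::size] = {}; // plain bool buffer; no alignment needed for the unaligned path
        in[1] = in[30] = true;

        mask_t m = mask_t::load_unaligned(in); // routes through the new epu16 branch

        bool out[mask_t::size] = {};
        m.store_unaligned(out);
        assert(out[1] && out[30] && !out[0]);
        return 0;
    }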