@@ -316,18 +316,64 @@ namespace xsimd
316
316
}
317
317
318
318
// load
319
- template <class A , class T , class = typename std::enable_if<batch_bool<T, A >::size == 64 , void >::type>
319
+ template <class A , class T , class = typename std::enable_if<std::is_integral<T >::value , void >::type>
320
320
XSIMD_INLINE batch_bool<T, A> load_unaligned (bool const * mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321
321
{
322
- __m512i bool_val = _mm512_loadu_si512 ((__m512i const *)mem);
323
- return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
322
+ XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64 )
323
+ {
324
+ __m512i bool_val = _mm512_loadu_si512 ((__m512i const *)mem);
325
+ return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
326
+ }
327
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32 )
328
+ {
329
+ __m256i bpack = _mm256_loadu_si256 ((__m256i const *)mem);
330
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi16 (bpack)) };
331
+ }
332
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16 )
333
+ {
334
+ __m128i bpack = _mm_loadu_si128 ((__m128i const *)mem);
335
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi32 (bpack)) };
336
+ }
337
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8 )
338
+ {
339
+ __m128i bpack = _mm_loadl_epi64 ((__m128i const *)mem);
340
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi64 (bpack)) };
341
+ }
342
+ else
343
+ {
344
+ assert (false && " unexpected batch size" );
345
+ return {};
346
+ }
324
347
}
325
348
326
- template <class A , class T , class = typename std::enable_if<batch_bool<T, A >::size == 64 , void >::type>
349
+ template <class A , class T , class = typename std::enable_if<std::is_integral<T >::value , void >::type>
327
350
XSIMD_INLINE batch_bool<T, A> load_aligned (bool const * mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
328
351
{
329
- __m512i bool_val = _mm512_load_si512 ((__m512i const *)mem);
330
- return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
352
+ XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64 )
353
+ {
354
+ __m512i bool_val = _mm512_load_si512 ((__m512i const *)mem);
355
+ return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
356
+ }
357
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32 )
358
+ {
359
+ __m256i bpack = _mm256_load_si256 ((__m256i const *)mem);
360
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi16 (bpack)) };
361
+ }
362
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16 )
363
+ {
364
+ __m128i bpack = _mm_load_si128 ((__m128i const *)mem);
365
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi32 (bpack)) };
366
+ }
367
+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8 )
368
+ {
369
+ __m128i bpack = _mm_loadl_epi64 ((__m128i const *)mem);
370
+ return { _mm512_sub_epi16 (_mm512_set1_epi8 (0 ), _mm512_cvtepu8_epi64 (bpack)) };
371
+ }
372
+ else
373
+ {
374
+ assert (false && " unexpected batch size" );
375
+ return {};
376
+ }
331
377
}
332
378
333
379
// max
0 commit comments