lib/x86/adler32: add an AVX-512 implementation #342

Merged · 4 commits · Feb 24, 2024 · Changes from all commits
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
@@ -126,8 +126,9 @@ jobs:
- run: >
cmake -B build -G "${{matrix.gen}}" -T ${{matrix.toolset}}
-A ${{matrix.vs}} -DLIBDEFLATE_BUILD_TESTS=1
-DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include"
-DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS"
-DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\lib\zlib.lib
-DZLIB_INCLUDE_DIR=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include
-DCMAKE_INSTALL_PREFIX=build\install
- run: cmake --build build --verbose --config Debug
- run: cmake --install build --verbose --config Debug
141 changes: 116 additions & 25 deletions lib/x86/adler32_impl.h
@@ -67,6 +67,19 @@
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
}

#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
{ \
__m256i /* __v8su */ s1_256bit, s2_256bit; \
\
/* 512 => 256 bits */ \
s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \
_mm512_extracti64x4_epi64((v_s1), 1)); \
s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
_mm512_extracti64x4_epi64((v_s2), 1)); \
\
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
}
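The new 512-bit finish macro performs only the first folding step (512 => 256 bits) and then defers to the existing 256-bit reduction. Functionally the whole chain just adds every 32-bit lane of the two accumulators into the scalar counters; here is a scalar model of that reduction (an illustrative sketch for intuition, not part of the patch):

	/*
	 * Scalar model of ADLER32_FINISH_VEC_CHUNK_512 (illustrative sketch):
	 * fold the sixteen 32-bit lanes of each accumulator into s1 and s2.
	 */
	static inline void
	finish_vec_chunk_512_model(u32 *s1, u32 *s2,
				   const u32 v_s1[16], const u32 v_s2[16])
	{
		int i;

		for (i = 0; i < 16; i++) {
			*s1 += v_s1[i];
			*s2 += v_s2[i];
		}
	}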

/*
* This is a very silly partial workaround for gcc bug
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to
@@ -105,15 +118,17 @@
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
static const u16 _aligned_attribute(16) mults[4][8] = {
{ 32, 31, 30, 29, 28, 27, 26, 25 },
{ 24, 23, 22, 21, 20, 19, 18, 17 },
{ 16, 15, 14, 13, 12, 11, 10, 9 },
{ 8, 7, 6, 5, 4, 3, 2, 1 },
};
const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]);
const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]);
const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]);
const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]);
const __m128i zeroes = _mm_setzero_si128();
const __m128i /* __v8hu */ mults_a =
_mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25);
const __m128i /* __v8hu */ mults_b =
_mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17);
const __m128i /* __v8hu */ mults_c =
_mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9);
const __m128i /* __v8hu */ mults_d =
_mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);

/* s1 counters: 32-bit, sum of bytes */
__m128i /* __v4su */ v_s1 = zeroes;
@@ -209,23 +224,21 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
const __m256i zeroes = _mm256_setzero_si256();
/*
* Note, the multipliers have to be in this order because
* _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately.
*/
const __m256i /* __v16hu */ mults_a =
_mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57,
48, 47, 46, 45, 44, 43, 42, 41);
const __m256i /* __v16hu */ mults_b =
_mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49,
40, 39, 38, 37, 36, 35, 34, 33);
const __m256i /* __v16hu */ mults_c =
_mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25,
16, 15, 14, 13, 12, 11, 10, 9);
const __m256i /* __v16hu */ mults_d =
_mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17,
8, 7, 6, 5, 4, 3, 2, 1);
static const u16 _aligned_attribute(32) mults[4][16] = {
{ 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
{ 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
{ 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
{ 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
};
const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]);
const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]);
const __m256i zeroes = _mm256_setzero_si256();
__m256i /* __v8su */ v_s1 = zeroes;
__m256i /* __v8su */ v_s2 = zeroes;
__m256i /* __v16hu */ v_byte_sums_a = zeroes;
@@ -263,14 +276,93 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */

#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
#define DEFAULT_IMPL adler32_avx2
#else
/*
* AVX512VNNI/AVX512BW implementation. Uses the dot product instruction
* vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
* (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
*
* We currently don't include an implementation using AVX512VNNI or AVX512BW
* alone, since CPUs with good AVX-512 implementations tend to have both.
*/
#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
# define FUNCNAME adler32_avx512vnni_avx512bw
# define FUNCNAME_CHUNK adler32_avx512vnni_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN (128 * (0xFFFF / 0xFF))
# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx512vnni,avx512bw")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <tmmintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avx512fintrin.h>
# include <avx512bwintrin.h>
# include <avx512vnniintrin.h>
# endif
static forceinline ATTRIBUTES void
adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
{
static const u8 _aligned_attribute(64) mults[64] = {
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
const __m512i zeroes = _mm512_setzero_si512();
__m512i /* __v16su */ v_s1 = zeroes;
__m512i /* __v16su */ v_s1_sums_a = zeroes;
__m512i /* __v16su */ v_s1_sums_b = zeroes;
__m512i /* __v16su */ v_s2_a = zeroes;
__m512i /* __v16su */ v_s2_b = zeroes;

do {
const __m512i bytes_a = *p++;
const __m512i bytes_b = *p++;
__m512i v_sad_a, v_sad_b;

v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
} while (p != end);

v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
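Two asides on the chunk function above; both are sketches, not patch code. First, the "vpdpbusd with ones" alternative mentioned in the comment would compute the plain byte sums as a dot product with an all-ones multiplier instead of using vpsadbw:

	/* Alternative byte-sum step (rejected by the patch as slower): */
	const __m512i ones = _mm512_set1_epi8(1);
	v_s1 = _mm512_dpbusd_epi32(v_s1, bytes_a, ones);

Second, the arithmetic the loop implements: each 64-byte group adds its byte sum to s1, and adds 64 * (s1 before the group) plus the weighted sum dot(group, {64, 63, ..., 1}) to s2. The deferred v_s1_sums accumulators supply the first term, scaled once at the end by the << 6 (i.e. * 64), and vpdpbusd supplies the second. An equivalent scalar model, assuming n is a multiple of 64 and that the modular reduction happens outside the chunk as in adler32_vec_template.h:

	static void
	adler32_chunk_model(const u8 *p, size_t n, u32 *s1, u32 *s2)
	{
		size_t g;
		int i;

		for (g = 0; g < n; g += 64) {
			u32 dot = 0, sum = 0;

			for (i = 0; i < 64; i++) {
				dot += (u32)(64 - i) * p[g + i]; /* vpdpbusd part */
				sum += p[g + i];                 /* vpsadbw part */
			}
			*s2 += (*s1 << 6) + dot; /* the v_s1_sums terms */
			*s1 += sum;
		}
	}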

static inline adler32_func_t
arch_select_adler32_func(void)
{
const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512vnni_avx512bw
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
return adler32_avx512vnni_avx512bw;
#endif
#ifdef adler32_avx2
if (HAVE_AVX2(features))
return adler32_avx2;
@@ -282,6 +374,5 @@ arch_select_adler32_func(void)
return NULL;
}
#define arch_select_adler32_func arch_select_adler32_func
#endif

#endif /* LIB_X86_ADLER32_IMPL_H */
13 changes: 11 additions & 2 deletions lib/x86/cpu_features.c
@@ -90,9 +90,12 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_ZMM, "zmm"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
};

volatile u32 libdeflate_x86_cpu_features = 0;
@@ -165,13 +168,19 @@ void libdeflate_init_x86_cpu_features(void)
features |= X86_CPU_FEATURE_AVX2;
if (b & (1 << 8))
features |= X86_CPU_FEATURE_BMI2;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
if (((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_ZMM;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512BW;
if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512VL;
if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
features |= X86_CPU_FEATURE_VPCLMULQDQ;
if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512VNNI;

out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
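A note on the xcr0 tests above (reference material, not patch code): XCR0 reports which register state the OS saves and restores. Mask 0x6 covers bits 1 (SSE/XMM state) and 2 (AVX/YMM state); mask 0xe6 additionally covers bits 5 (AVX-512 opmask), 6 (upper halves of ZMM0-15), and 7 (ZMM16-31), i.e. the full AVX-512 state. The old AVX512VL check used 0xa6, which omits bit 6; the patch tightens all of the AVX-512 checks to 0xe6. Spelled out as predicates (a sketch under the standard XCR0 bit layout):

	static inline int os_saves_avx_state(u64 xcr0)
	{
		return (xcr0 & 0x6) == 0x6;	/* XMM + YMM state */
	}

	static inline int os_saves_avx512_state(u64 xcr0)
	{
		return (xcr0 & 0xe6) == 0xe6;	/* plus opmask + ZMM state */
	}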
47 changes: 42 additions & 5 deletions lib/x86/cpu_features.h
@@ -44,18 +44,29 @@
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512F 0x00000020
#define X86_CPU_FEATURE_AVX512VL 0x00000040
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080
/*
* ZMM indicates whether 512-bit vectors (zmm registers) should be used. On
* some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
* supports it, i.e. even if AVX512F is set. On these CPUs, we may still use
* AVX-512 instructions, but only with ymm and xmm registers.
*/
#define X86_CPU_FEATURE_ZMM 0x00000020
#define X86_CPU_FEATURE_AVX512F 0x00000040
#define X86_CPU_FEATURE_AVX512BW 0x00000080
#define X86_CPU_FEATURE_AVX512VL 0x00000100
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200
#define X86_CPU_FEATURE_AVX512VNNI 0x00000400

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))
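Each HAVE_*() wrapper short-circuits to constant true when the build already targets the feature (the corresponding *_NATIVE macro is then 1), so dispatch code pays for the runtime feature test only in generic builds. An illustrative expansion (a sketch; use_avx512bw_path() is a hypothetical stand-in):

	/*
	 * With -mavx512bw, __AVX512BW__ is defined, HAVE_AVX512BW_NATIVE
	 * is 1, and the condition below folds to constant true.
	 */
	if (HAVE_AVX512BW(features))
		use_avx512bw_path();	/* hypothetical */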

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
@@ -175,6 +186,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512F_INTRIN 0
#endif

/* AVX-512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
# define HAVE_AVX512BW_NATIVE 0
#endif
#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512BW_INTRIN 1
#else
# define HAVE_AVX512BW_INTRIN 0
#endif

/* AVX-512VL */
#ifdef __AVX512VL__
# define HAVE_AVX512VL_NATIVE 1
@@ -194,13 +218,26 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
#else
# define HAVE_VPCLMULQDQ_NATIVE 0
#endif
#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER))
#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER)
# define HAVE_VPCLMULQDQ_INTRIN 1
#else
# define HAVE_VPCLMULQDQ_INTRIN 0
#endif

/* AVX512VNNI */
#ifdef __AVX512VNNI__
# define HAVE_AVX512VNNI_NATIVE 1
#else
# define HAVE_AVX512VNNI_NATIVE 0
#endif
#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512VNNI_INTRIN 1
#else
# define HAVE_AVX512VNNI_INTRIN 0
#endif

#endif /* ARCH_X86_32 || ARCH_X86_64 */

#endif /* LIB_X86_CPU_FEATURES_H */
39 changes: 21 additions & 18 deletions lib/x86/crc32_impl.h
@@ -95,14 +95,16 @@
#endif

/*
* VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512
* instructions but uses 256-bit vectors rather than 512-bit. This can be
* useful on CPUs where 512-bit vectors cause downclocking.
* VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage
* of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit.
* This can be useful on CPUs where 512-bit vectors cause downclocking.
*/
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl
# define SUFFIX _vpclmulqdq_avx512vl
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
# define SUFFIX _vpclmulqdq_avx512_vl256
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
Expand All @@ -113,16 +115,16 @@
# include "crc32_pclmul_template.h"
#endif

/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. */
/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl
# define SUFFIX _vpclmulqdq_avx512f_avx512vl
#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
# define SUFFIX _vpclmulqdq_avx512_vl512
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl")
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
# endif
# define VL 64
# define FOLD_LESSTHAN16BYTES 1
@@ -136,15 +138,16 @@ arch_select_crc32_func(void)
{
const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
#ifdef crc32_x86_vpclmulqdq_avx512_vl512
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
HAVE_AVX512F(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512f_avx512vl;
return crc32_x86_vpclmulqdq_avx512_vl512;
#endif
#ifdef crc32_x86_vpclmulqdq_avx512vl
#ifdef crc32_x86_vpclmulqdq_avx512_vl256
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512vl;
HAVE_AVX512F(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512_vl256;
#endif
#ifdef crc32_x86_vpclmulqdq_avx2
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
Expand Down
2 changes: 1 addition & 1 deletion lib/x86/crc32_pclmul_template.h
@@ -38,7 +38,7 @@
* VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1
* VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2
* VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl
* VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl
* VL=64: at least vpclmulqdq,pclmul,avx512vl
* VL:
* Vector length in bytes. Supported values are 16, 32, and 64.
* FOLD_LESSTHAN16BYTES:
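For context, crc32_impl.h instantiates this template once per vector length by defining the parameters documented above and then including the header. A hypothetical instantiation following that pattern (the suffix is illustrative; the real instantiations are in crc32_impl.h):

	#define SUFFIX			_vpclmulqdq_avx2_example /* hypothetical */
	#define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
	#define VL			32
	#define FOLD_LESSTHAN16BYTES	1
	#define USE_TERNARYLOGIC	0
	#include "crc32_pclmul_template.h"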