From 2929aea4ed833da7993d04a7abcccc95422dc276 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 1/4] ci.yml: ensure MSVC can find zlib

---
 .github/workflows/ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f738eed4..e158e7db 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -126,8 +126,9 @@ jobs:
     - run: >
         cmake -B build -G "${{matrix.gen}}" -T ${{matrix.toolset}}
         -A ${{matrix.vs}} -DLIBDEFLATE_BUILD_TESTS=1
-        -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include"
+        -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS"
         -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\lib\zlib.lib
+        -DZLIB_INCLUDE_DIR=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include
         -DCMAKE_INSTALL_PREFIX=build\install
     - run: cmake --build build --verbose --config Debug
     - run: cmake --install build --verbose --config Debug

From ea028f147da84918a383418ac3d80bc10748e72b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 2/4] lib/x86: disambiguate 512-bit vector support from AVX-512F

crc32_x86_vpclmulqdq_avx512vl and crc32_x86_vpclmulqdq_avx512f_avx512vl
actually use the same CPU features, considering that vpternlog always
requires at least avx512f, and compilers consider avx512vl to imply
avx512f.

Rename them to *_avx512_vl256 and *_avx512_vl512 to reflect that they
differ only in vector length, and fix the CPU feature checking to use a
separate flag for whether 512-bit vectors are enabled.
---
 lib/x86/cpu_features.c          |  5 ++++-
 lib/x86/cpu_features.h          | 13 ++++++++---
 lib/x86/crc32_impl.h            | 39 ++++++++++++++++++---------------
 lib/x86/crc32_pclmul_template.h |  2 +-
 scripts/checksum_benchmarks.sh  | 22 +++++++++----------
 scripts/run_tests.sh            |  2 +-
 6 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 27564c56..72bd37c2 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -90,6 +90,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX, "avx"},
 	{X86_CPU_FEATURE_AVX2, "avx2"},
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
+	{X86_CPU_FEATURE_ZMM, "zmm"},
 	{X86_CPU_FEATURE_AVX512F, "avx512f"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
@@ -165,8 +166,10 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_AVX2;
 		if (b & (1 << 8))
 			features |= X86_CPU_FEATURE_BMI2;
-		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
+		if (((xcr0 & 0xe6) == 0xe6) &&
 		    allow_512bit_vectors(manufacturer, family, model))
+			features |= X86_CPU_FEATURE_ZMM;
+		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
 		if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
 			features |= X86_CPU_FEATURE_AVX512VL;
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index b5fbf573..6f68e032 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -44,9 +44,16 @@
 #define X86_CPU_FEATURE_AVX		0x00000004
 #define X86_CPU_FEATURE_AVX2		0x00000008
 #define X86_CPU_FEATURE_BMI2		0x00000010
-#define X86_CPU_FEATURE_AVX512F		0x00000020
-#define X86_CPU_FEATURE_AVX512VL	0x00000040
-#define X86_CPU_FEATURE_VPCLMULQDQ	0x00000080
+/*
+ * ZMM indicates whether 512-bit vectors (zmm registers) should be used.  On
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
+ * supports it, i.e. even if AVX512F is set.
On these CPUs, we may still use + * AVX-512 instructions, but only with ymm and xmm registers. + */ +#define X86_CPU_FEATURE_ZMM 0x00000020 +#define X86_CPU_FEATURE_AVX512F 0x00000040 +#define X86_CPU_FEATURE_AVX512VL 0x00000080 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index e818d0a6..d50547bd 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -95,14 +95,16 @@ #endif /* - * VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512 - * instructions but uses 256-bit vectors rather than 512-bit. This can be - * useful on CPUs where 512-bit vectors cause downclocking. + * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage + * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. + * This can be useful on CPUs where 512-bit vectors cause downclocking. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN -# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl -# define SUFFIX _vpclmulqdq_avx512vl -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ + HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 +# define SUFFIX _vpclmulqdq_avx512_vl256 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ + HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") @@ -113,16 +115,16 @@ # include "crc32_pclmul_template.h" #endif -/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. 
*/ +/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ #if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN -# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl -# define SUFFIX _vpclmulqdq_avx512f_avx512vl -#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ +# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 +# define SUFFIX _vpclmulqdq_avx512_vl512 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl") +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # endif # define VL 64 # define FOLD_LESSTHAN16BYTES 1 @@ -136,15 +138,16 @@ arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl - if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && +#ifdef crc32_x86_vpclmulqdq_avx512_vl512 + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && HAVE_AVX512F(features) && HAVE_AVX512VL(features)) - return crc32_x86_vpclmulqdq_avx512f_avx512vl; + return crc32_x86_vpclmulqdq_avx512_vl512; #endif -#ifdef crc32_x86_vpclmulqdq_avx512vl +#ifdef crc32_x86_vpclmulqdq_avx512_vl256 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && - HAVE_AVX512VL(features)) - return crc32_x86_vpclmulqdq_avx512vl; + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl256; #endif #ifdef crc32_x86_vpclmulqdq_avx2 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 16e4ebf5..eb06262b 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -38,7 +38,7 @@ * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl - * VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512vl * VL: * Vector length in bytes. Supported values are 16, 32, and 64. 
 * FOLD_LESSTHAN16BYTES:
diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh
index b359fba3..b9c38698 100755
--- a/scripts/checksum_benchmarks.sh
+++ b/scripts/checksum_benchmarks.sh
@@ -67,12 +67,11 @@ sort_by_speed() {
 }
 
 disable_cpu_feature() {
-	local name="$1"
+	LIBDEFLATE_DISABLE_CPU_FEATURES+=",$1"
 	shift
-	local extra_cflags=("$@")
-
-	LIBDEFLATE_DISABLE_CPU_FEATURES+=",$name"
-	EXTRA_CFLAGS+=("${extra_cflags[@]}")
+	if (( $# > 0 )); then
+		EXTRA_CFLAGS+=("$@")
+	fi
 }
 
 cleanup() {
@@ -114,15 +113,14 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_features vpclmulqdq avx512f avx512vl; then
-		do_benchmark "VPCLMULQDQ/AVX512F/AVX512VL"
-		disable_cpu_feature "avx512f" "-mno-avx512f"
-	fi
-	if have_cpu_features vpclmulqdq avx512vl; then
-		do_benchmark "VPCLMULQDQ/AVX512VL"
+	if have_cpu_features vpclmulqdq pclmulqdq avx512f avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512/VL512"
+		disable_cpu_feature "zmm"
+		do_benchmark "VPCLMULQDQ/AVX512/VL256"
 		disable_cpu_feature "avx512vl" "-mno-avx512vl"
+		disable_cpu_feature "avx512f" "-mno-avx512f"
 	fi
-	if have_cpu_features vpclmulqdq avx2; then
+	if have_cpu_features vpclmulqdq pclmulqdq avx2; then
 		do_benchmark "VPCLMULQDQ/AVX2"
 		disable_cpu_feature "vpclmulqdq" "-mno-vpclmulqdq"
 	fi
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index ff15cc19..b07f71a7 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,7 +142,7 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(vpclmulqdq avx512vl avx512f
+			features+=(zmm vpclmulqdq avx512vl avx512f
 				   avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)

From 5b217a14d6802ffe5cd586dd8aa5cc44610eb633 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 3/4] lib/x86: fix XCR0 check for AVX-512VL

According to the Intel manual, the ZMM_Hi256 bit needs to be checked for
all AVX-512 instructions, even if 512-bit vectors aren't being used.
---
 lib/x86/cpu_features.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 72bd37c2..ba7a39ff 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -171,7 +171,7 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_ZMM;
 		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
-		if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
+		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512VL;
 		if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
 			features |= X86_CPU_FEATURE_VPCLMULQDQ;

From a026a046e05f4fdd1a27f880f8344a97c2ed5571 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 4/4] lib/x86/adler32: add an AVX-512 implementation

libdeflate used to (before commit 416bac37a0c4) have an AVX512BW
implementation of Adler-32, but I removed it due to AVX-512's
downclocking issues.  Since then, newer Intel and AMD CPUs have come out
with better AVX-512 implementations, and these CPUs tend to have
AVX512VNNI, which includes a dot product instruction that is useful for
Adler-32.  Therefore, add an AVX512VNNI/AVX512BW implementation.
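
Note (an illustrative sketch, not the patch's code): starting from state
(s1, s2), a 64-byte block b[0..63] updates the Adler-32 sums as
s1 += sum(b[i]) and s2 += 64*s1_old + sum((64 - i) * b[i]).  The new chunk
function computes the weighted sum with vpdpbusd against the descending
multipliers 64..1 and the plain byte sum with vpsadbw, deferring all
mod-65521 reductions as the SSE2 and AVX2 paths do.  A scalar model of one
64-byte step, using a hypothetical helper name:

	#include <stdint.h>

	/* Scalar model of one 64-byte step of the vectorized loop. */
	static void
	adler32_block64_model(const uint8_t b[64], uint32_t *s1, uint32_t *s2)
	{
		uint32_t byte_sum = 0;		/* what vpsadbw accumulates */
		uint32_t weighted_sum = 0;	/* what vpdpbusd accumulates */
		int i;

		for (i = 0; i < 64; i++) {
			byte_sum += b[i];
			weighted_sum += (64 - i) * b[i];
		}
		/* 64 * *s1 is the v_s1_sums term, shifted left by 6 in the patch. */
		*s2 += 64 * *s1 + weighted_sum;
		*s1 += byte_sum;
	}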
---
 lib/x86/adler32_impl.h         | 141 +++++++++++++++++++++++++++------
 lib/x86/cpu_features.c         |   6 ++
 lib/x86/cpu_features.h         |  38 ++++++++-
 scripts/checksum_benchmarks.sh |   5 ++
 scripts/run_tests.sh           |   4 +-
 5 files changed, 163 insertions(+), 31 deletions(-)

diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index 6285dc80..eb03083e 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -67,6 +67,19 @@
 	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
 }
 
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
+{ \
+	__m256i /* __v8su */ s1_256bit, s2_256bit; \
+ \
+	/* 512 => 256 bits */ \
+	s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \
+				     _mm512_extracti64x4_epi64((v_s1), 1)); \
+	s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
+				     _mm512_extracti64x4_epi64((v_s2), 1)); \
+ \
+	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
+}
+
 /*
  * This is a very silly partial workaround for gcc bug
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892.  The bug causes gcc to
@@ -105,15 +118,17 @@ static forceinline ATTRIBUTES void
 adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 {
+	static const u16 _aligned_attribute(16) mults[4][8] = {
+		{ 32, 31, 30, 29, 28, 27, 26, 25 },
+		{ 24, 23, 22, 21, 20, 19, 18, 17 },
+		{ 16, 15, 14, 13, 12, 11, 10, 9 },
+		{ 8, 7, 6, 5, 4, 3, 2, 1 },
+	};
+	const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]);
+	const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]);
+	const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]);
+	const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]);
 	const __m128i zeroes = _mm_setzero_si128();
-	const __m128i /* __v8hu */ mults_a =
-		_mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25);
-	const __m128i /* __v8hu */ mults_b =
-		_mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17);
-	const __m128i /* __v8hu */ mults_c =
-		_mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9);
-	const __m128i /* __v8hu */ mults_d =
-		_mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);
 
 	/* s1 counters: 32-bit, sum of bytes */
 	__m128i /* __v4su */ v_s1 = zeroes;
@@ -209,23 +224,21 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 static forceinline ATTRIBUTES void
 adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 {
-	const __m256i zeroes = _mm256_setzero_si256();
 	/*
 	 * Note, the multipliers have to be in this order because
 	 * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately.
 	 */
-	const __m256i /* __v16hu */ mults_a =
-		_mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57,
-				  48, 47, 46, 45, 44, 43, 42, 41);
-	const __m256i /* __v16hu */ mults_b =
-		_mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49,
-				  40, 39, 38, 37, 36, 35, 34, 33);
-	const __m256i /* __v16hu */ mults_c =
-		_mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25,
-				  16, 15, 14, 13, 12, 11, 10, 9);
-	const __m256i /* __v16hu */ mults_d =
-		_mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17,
-				  8, 7, 6, 5, 4, 3, 2, 1);
+	static const u16 _aligned_attribute(32) mults[4][16] = {
+		{ 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
+		{ 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
+		{ 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
+		{ 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
+	};
+	const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
+	const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
+	const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]);
+	const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]);
+	const __m256i zeroes = _mm256_setzero_si256();
 	__m256i /* __v8su */ v_s1 = zeroes;
 	__m256i /* __v8su */ v_s2 = zeroes;
 	__m256i /* __v16hu */ v_byte_sums_a = zeroes;
@@ -263,14 +276,93 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 # include "../adler32_vec_template.h"
 #endif /* HAVE_AVX2_INTRIN */
 
-#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
-#define DEFAULT_IMPL adler32_avx2
-#else
+/*
+ * AVX512VNNI/AVX512BW implementation.  Uses the dot product instruction
+ * vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
+ * (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
+ *
+ * We currently don't include an implementation using AVX512VNNI or AVX512BW
+ * alone, since CPUs with good AVX-512 implementations tend to have both.
+ */
+#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
+# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
+# define FUNCNAME		adler32_avx512vnni_avx512bw
+# define FUNCNAME_CHUNK		adler32_avx512vnni_avx512bw_chunk
+# define IMPL_ALIGNMENT		64
+# define IMPL_SEGMENT_LEN	128
+# define IMPL_MAX_CHUNK_LEN	(128 * (0xFFFF / 0xFF))
+# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("avx512vnni,avx512bw")
+# endif
+# include <immintrin.h>
+ /*
+  * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+  * including some sub-headers.
+  */
+# if defined(__clang__) && defined(_MSC_VER)
+# include <tmmintrin.h>
+# include <smmintrin.h>
+# include <wmmintrin.h>
+# include <avxintrin.h>
+# include <avx2intrin.h>
+# include <avx512fintrin.h>
+# include <avx512bwintrin.h>
+# include <avx512vnniintrin.h>
+# endif
+static forceinline ATTRIBUTES void
+adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+				  u32 *s1, u32 *s2)
+{
+	static const u8 _aligned_attribute(64) mults[64] = {
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+	};
+	const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
+	const __m512i zeroes = _mm512_setzero_si512();
+	__m512i /* __v16su */ v_s1 = zeroes;
+	__m512i /* __v16su */ v_s1_sums_a = zeroes;
+	__m512i /* __v16su */ v_s1_sums_b = zeroes;
+	__m512i /* __v16su */ v_s2_a = zeroes;
+	__m512i /* __v16su */ v_s2_b = zeroes;
+
+	do {
+		const __m512i bytes_a = *p++;
+		const __m512i bytes_b = *p++;
+		__m512i v_sad_a, v_sad_b;
+
+		v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
+		v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
+		v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
+		v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
+		v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
+		v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
+	} while (p != end);
+
+	v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
+	v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
+
 static inline adler32_func_t
 arch_select_adler32_func(void)
 {
 	const u32 features MAYBE_UNUSED = get_x86_cpu_features();
 
+#ifdef adler32_avx512vnni_avx512bw
+	if ((features & X86_CPU_FEATURE_ZMM) &&
+	    HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
+		return adler32_avx512vnni_avx512bw;
+#endif
 #ifdef adler32_avx2
 	if (HAVE_AVX2(features))
 		return adler32_avx2;
@@ -282,6 +374,5 @@ arch_select_adler32_func(void)
 	return NULL;
 }
 #define arch_select_adler32_func arch_select_adler32_func
-#endif
 
 #endif /* LIB_X86_ADLER32_IMPL_H */
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index ba7a39ff..750bf66c 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -92,8 +92,10 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
 	{X86_CPU_FEATURE_ZMM, "zmm"},
 	{X86_CPU_FEATURE_AVX512F, "avx512f"},
+	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
+	{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
 };
 
 volatile u32 libdeflate_x86_cpu_features = 0;
@@ -171,10 +173,14 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_ZMM;
 		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
+		if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512BW;
 		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512VL;
 		if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
 			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+		if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
 
 out:
 	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
diff --git
a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 6f68e032..10afd8dd 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -52,8 +52,10 @@ */ #define X86_CPU_FEATURE_ZMM 0x00000020 #define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512VL 0x00000080 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 +#define X86_CPU_FEATURE_AVX512BW 0x00000080 +#define X86_CPU_FEATURE_AVX512VL 0x00000100 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 +#define X86_CPU_FEATURE_AVX512VNNI 0x00000400 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) @@ -61,8 +63,10 @@ #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) #define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) +#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) #define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) #define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) +#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI)) #if HAVE_DYNAMIC_X86_CPU_FEATURES #define X86_CPU_FEATURES_KNOWN 0x80000000 @@ -182,6 +186,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_AVX512F_INTRIN 0 #endif +/* AVX-512BW */ +#ifdef __AVX512BW__ +# define HAVE_AVX512BW_NATIVE 1 +#else +# define HAVE_AVX512BW_NATIVE 0 +#endif +#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512BW_INTRIN 1 +#else +# define HAVE_AVX512BW_INTRIN 0 +#endif + /* AVX-512VL */ #ifdef __AVX512VL__ # define HAVE_AVX512VL_NATIVE 1 @@ -201,13 +218,26 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_VPCLMULQDQ_NATIVE 0 #endif -#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER)) +#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER) # define HAVE_VPCLMULQDQ_INTRIN 1 #else # define HAVE_VPCLMULQDQ_INTRIN 0 #endif +/* AVX512VNNI */ +#ifdef __AVX512VNNI__ +# define HAVE_AVX512VNNI_NATIVE 1 +#else +# define HAVE_AVX512VNNI_NATIVE 0 +#endif +#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512VNNI_INTRIN 1 +#else +# define HAVE_AVX512VNNI_INTRIN 0 +#endif + #endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index b9c38698..6b418190 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -157,6 +157,11 @@ echo { case $ARCH in i386|x86_64) + if have_cpu_features avx512_vnni avx512bw; then + do_benchmark "AVX512VNNI/AVX512BW" + disable_cpu_feature "avx512vnni" "-mno-avx512vnni" + disable_cpu_feature "avx512bw" "-mno-avx512bw" + fi if have_cpu_feature avx2; then do_benchmark "AVX2" disable_cpu_feature "avx2" "-mno-avx2" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index b07f71a7..9bca9677 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,8 +142,8 @@ build_and_run_tests() if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! 
$quick; then case "$ARCH" in i386|x86_64) - features+=(zmm vpclmulqdq avx512vl avx512f - avx2 avx bmi2 pclmulqdq sse2) + features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw + avx512f avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*) features+=(dotprod sha3 crc32 pmull neon)
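
Note (an illustrative sketch, not part of the patches): the XCR0 masks used
in libdeflate_init_x86_cpu_features() decode as follows, assuming the XSAVE
state-component bit assignments from the Intel SDM.  This is why PATCH 3/4
tightens the AVX512VL check from 0xa6 to 0xe6: 0xa6 omits the ZMM_Hi256
bit, which must be enabled before executing any AVX-512 instruction, even
one that only touches xmm or ymm registers.

	/* XCR0 state-component bits (hypothetical names, for illustration) */
	#define XCR0_SSE	(1u << 1)	/* xmm registers */
	#define XCR0_AVX	(1u << 2)	/* upper 128 bits of ymm0-15 */
	#define XCR0_OPMASK	(1u << 5)	/* k0-k7 mask registers */
	#define XCR0_ZMM_HI256	(1u << 6)	/* upper 256 bits of zmm0-15 */
	#define XCR0_HI16_ZMM	(1u << 7)	/* zmm16-31 */

	/* 0xe6 covers all state that AVX-512 instructions can touch. */
	_Static_assert((XCR0_SSE | XCR0_AVX | XCR0_OPMASK |
			XCR0_ZMM_HI256 | XCR0_HI16_ZMM) == 0xe6, "");
	/* 0xa6 was the old AVX512VL mask; it is missing ZMM_Hi256. */
	_Static_assert((XCR0_SSE | XCR0_AVX | XCR0_OPMASK |
			XCR0_HI16_ZMM) == 0xa6, "");
	/* 0x6 (SSE|AVX) suffices for the VEX-encoded VPCLMULQDQ forms. */
	_Static_assert((XCR0_SSE | XCR0_AVX) == 0x6, "");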