diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index eb03083e..a6abe651 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -277,24 +277,115 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 #endif /* HAVE_AVX2_INTRIN */
 
 /*
- * AVX512VNNI/AVX512BW implementation.  Uses the dot product instruction
- * vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
- * (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
+ * AVX2/AVX-VNNI implementation.  This is similar to the AVX512BW/AVX512VNNI
+ * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI.
+ * AVX-VNNI adds dot product instructions to CPUs without AVX-512.
+ */
+#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN
+# define adler32_avx2_vnni	adler32_avx2_vnni
+# define FUNCNAME		adler32_avx2_vnni
+# define FUNCNAME_CHUNK		adler32_avx2_vnni_chunk
+# define IMPL_ALIGNMENT		32
+# define IMPL_SEGMENT_LEN	128
+# define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
+# if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE
+#  define ATTRIBUTES
+# else
+#  define ATTRIBUTES	_target_attribute("avx2,avxvnni")
+# endif
+# include <immintrin.h>
+  /*
+   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+   * including some sub-headers.
+   */
+# if defined(__clang__) && defined(_MSC_VER)
+#  include <tmmintrin.h>
+#  include <smmintrin.h>
+#  include <wmmintrin.h>
+#  include <avxintrin.h>
+#  include <avx2intrin.h>
+#  include <avxvnniintrin.h>
+# endif
+static forceinline ATTRIBUTES void
+adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
+			u32 *s1, u32 *s2)
+{
+	static const u8 _aligned_attribute(32) mults[2][32] = {
+		{ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		  48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, },
+		{ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		  16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1, },
+	};
+	const __m256i /* __v32qu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
+	const __m256i /* __v32qu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
+	const __m256i zeroes = _mm256_setzero_si256();
+	__m256i /* __v8su */ v_s1_a = zeroes;
+	__m256i /* __v8su */ v_s1_b = zeroes;
+	__m256i /* __v8su */ v_s1_sums_a = zeroes;
+	__m256i /* __v8su */ v_s1_sums_b = zeroes;
+	__m256i /* __v8su */ v_s2_a = zeroes;
+	__m256i /* __v8su */ v_s2_b = zeroes;
+	__m256i /* __v8su */ v_s2_c = zeroes;
+	__m256i /* __v8su */ v_s2_d = zeroes;
+
+	do {
+		__m256i bytes_a = *p++;
+		__m256i bytes_b = *p++;
+		__m256i bytes_c = *p++;
+		__m256i bytes_d = *p++;
+
+		v_s2_a = _mm256_dpbusd_avx_epi32(v_s2_a, bytes_a, mults_a);
+		v_s2_b = _mm256_dpbusd_avx_epi32(v_s2_b, bytes_b, mults_b);
+		v_s2_c = _mm256_dpbusd_avx_epi32(v_s2_c, bytes_c, mults_a);
+		v_s2_d = _mm256_dpbusd_avx_epi32(v_s2_d, bytes_d, mults_b);
+		v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_a);
+		v_s1_sums_b = _mm256_add_epi32(v_s1_sums_b, v_s1_b);
+		v_s1_a = _mm256_add_epi32(v_s1_a,
+					  _mm256_add_epi32(_mm256_sad_epu8(bytes_a, zeroes),
+							   _mm256_sad_epu8(bytes_b, zeroes)));
+		v_s1_b = _mm256_add_epi32(v_s1_b,
+					  _mm256_add_epi32(_mm256_sad_epu8(bytes_c, zeroes),
+							   _mm256_sad_epu8(bytes_d, zeroes)));
+#if GCC_PREREQ(1, 0)
+		__asm__("" : "+x" (v_s1_a), "+x" (v_s1_b), "+x" (v_s1_sums_a),
+			     "+x" (v_s1_sums_b), "+x" (v_s2_a), "+x" (v_s2_b),
+			     "+x" (v_s2_c), "+x" (v_s2_d));
+#endif
+	} while (p != end);
+
+	v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_sums_b);
+	v_s1_sums_a = _mm256_slli_epi32(v_s1_sums_a, 7);
+	v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, _mm256_slli_epi32(v_s1_a, 6));
+	v_s1_a = _mm256_add_epi32(v_s1_a, v_s1_b);
+	v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_b);
+	v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d);
+	v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c);
+	v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a);
+	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */
+
+/*
+ * AVX512BW/AVX512VNNI implementation.  Uses the vpsadbw (sum of absolute
+ * differences) instruction from AVX512BW and the vpdpbusd (dot product)
+ * instruction from AVX512VNNI.  Note that vpdpbusd with all-ones could be used
+ * instead of vpsadbw, but vpsadbw is faster.
  *
- * We currently don't include an implementation using AVX512VNNI or AVX512BW
+ * We currently don't include an implementation using AVX512BW or AVX512VNNI
  * alone, since CPUs with good AVX-512 implementations tend to have both.
  */
-#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
-# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
-# define FUNCNAME	adler32_avx512vnni_avx512bw
-# define FUNCNAME_CHUNK	adler32_avx512vnni_avx512bw_chunk
+#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
+# define adler32_avx512_vnni	adler32_avx512_vnni
+# define FUNCNAME	adler32_avx512_vnni
+# define FUNCNAME_CHUNK	adler32_avx512_vnni_chunk
 # define IMPL_ALIGNMENT	64
 # define IMPL_SEGMENT_LEN	128
-# define IMPL_MAX_CHUNK_LEN	(128 * (0xFFFF / 0xFF))
-# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
+# define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
+# if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE
 #  define ATTRIBUTES
 # else
-#  define ATTRIBUTES	_target_attribute("avx512vnni,avx512bw")
+#  define ATTRIBUTES	_target_attribute("avx512bw,avx512vnni")
 # endif
 # include <immintrin.h>
  /*
@@ -312,8 +403,8 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 #  include <avx512vnniintrin.h>
 # endif
 static forceinline ATTRIBUTES void
-adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
-				  u32 *s1, u32 *s2)
+adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
+			  u32 *s1, u32 *s2)
 {
 	static const u8 _aligned_attribute(64) mults[64] = {
 		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
@@ -323,7 +414,8 @@ adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
 	};
 	const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
 	const __m512i zeroes = _mm512_setzero_si512();
-	__m512i /* __v16su */ v_s1 = zeroes;
+	__m512i /* __v16su */ v_s1_a = zeroes;
+	__m512i /* __v16su */ v_s1_b = zeroes;
 	__m512i /* __v16su */ v_s1_sums_a = zeroes;
 	__m512i /* __v16su */ v_s1_sums_b = zeroes;
 	__m512i /* __v16su */ v_s2_a = zeroes;
@@ -332,36 +424,41 @@ adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
 	do {
 		const __m512i bytes_a = *p++;
 		const __m512i bytes_b = *p++;
-		__m512i v_sad_a, v_sad_b;
 
 		v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
 		v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
-		v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
-		v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
-		v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
-		v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
-		v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
-		v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
+		v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a);
+		v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b);
+		v_s1_a = _mm512_add_epi32(v_s1_a, _mm512_sad_epu8(bytes_a, zeroes));
+		v_s1_b = _mm512_add_epi32(v_s1_b, _mm512_sad_epu8(bytes_b, zeroes));
+		GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b,
+				v_s2_a, v_s2_b);
 	} while (p != end);
 
 	v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
-	v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
+	v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 7);
+	v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, _mm512_slli_epi32(v_s1_a, 6));
+	v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b);
 	v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
 	v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
-	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a);
 }
 # include "../adler32_vec_template.h"
-#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
+#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */
 
 static inline adler32_func_t
 arch_select_adler32_func(void)
 {
 	const u32 features MAYBE_UNUSED = get_x86_cpu_features();
 
-#ifdef adler32_avx512vnni_avx512bw
+#ifdef adler32_avx512_vnni
 	if ((features & X86_CPU_FEATURE_ZMM) &&
-	    HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
-		return adler32_avx512vnni_avx512bw;
+	    HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features))
+		return adler32_avx512_vnni;
+#endif
+#ifdef adler32_avx2_vnni
+	if (HAVE_AVX2(features) && HAVE_AVXVNNI(features))
+		return adler32_avx2_vnni;
 #endif
 #ifdef adler32_avx2
 	if (HAVE_AVX2(features))
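For review, the arithmetic both chunk functions rely on can be checked against a scalar model (illustrative code, not part of the patch; the helper names are made up). Each 128-byte segment is split into 64-byte halves that vpdpbusd weights 64..1; first-half bytes are 64 positions short of their true weight, and every byte of an earlier segment gains another 128 per later segment, which is where the shifts by 6 and 7 come from:

#include <assert.h>
#include <stdint.h>

/* Reference recurrence: s1 += each byte; s2 += each intermediate s1. */
static void adler_naive(const uint8_t *p, int n, uint32_t *s1, uint32_t *s2)
{
	for (int i = 0; i < n; i++) {
		*s1 += p[i];
		*s2 += *s1;
	}
}

/* The same update for one 128-byte segment, decomposed like the SIMD code. */
static void adler_decomposed(const uint8_t *p, uint32_t *s1, uint32_t *s2)
{
	uint32_t dot = 0, sum = 0, first_half = 0;

	for (int i = 0; i < 64; i++) {
		dot += (uint32_t)(64 - i) * p[i];	/* vpdpbusd, mults 64..1 */
		dot += (uint32_t)(64 - i) * p[64 + i];
		first_half += p[i];			/* vpsadbw of first half */
		sum += p[i] + p[64 + i];		/* vpsadbw of both halves */
	}
	/*
	 * p[i] with i < 64 needs weight 128 - i but the dot product gave it
	 * 64 - i: add 64 * first_half (the slli by 6 of v_s1_a).  Bytes of
	 * earlier segments gain 128 per following segment: add 128 * the old
	 * s1 (the slli by 7 of v_s1_sums across iterations).
	 */
	*s2 += 128 * *s1 + 64 * first_half + dot;
	*s1 += sum;
}

int main(void)
{
	uint8_t seg[128];
	uint32_t s1a = 1, s2a = 0, s1b = 1, s2b = 0;

	for (int i = 0; i < 128; i++)
		seg[i] = (uint8_t)(i * 37 + 5);
	adler_naive(seg, 128, &s1a, &s2a);
	adler_decomposed(seg, &s1b, &s2b);
	assert(s1a == s1b && s2a == s2b);
	return 0;
}

Running this with assertions enabled confirms the decomposition matches the naive recurrence for one segment.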
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 750bf66c..227fd221 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -95,7 +95,8 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
-	{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
+	{X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"},
+	{X86_CPU_FEATURE_AVXVNNI, "avx_vnni"},
 };
 
 volatile u32 libdeflate_x86_cpu_features = 0;
@@ -182,6 +183,11 @@ void libdeflate_init_x86_cpu_features(void)
 	if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
 		features |= X86_CPU_FEATURE_AVX512VNNI;
 
+	/* EAX=7, ECX=1: Extended Features */
+	cpuid(7, 1, &a, &b, &c, &d);
+	if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6))
+		features |= X86_CPU_FEATURE_AVXVNNI;
+
 out:
 	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
 					 ARRAY_LEN(x86_cpu_feature_table));
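As a standalone sanity check of the new detection logic (a sketch, not patch code; it uses GCC/clang's <cpuid.h> and raw xgetbv in place of libdeflate's internal cpuid()/read_xcr() helpers): AVX-VNNI is reported in CPUID leaf 7, subleaf 1, EAX bit 4, and is only usable when the OS has enabled XMM and YMM state, i.e. XCR0 bits 1 and 2 — the 0x6 mask above.

#include <cpuid.h>
#include <stdio.h>

static unsigned long long read_xcr0(void)
{
	unsigned eax, edx;

	/* xgetbv with ecx = 0 (encoded as bytes for old assemblers);
	 * only valid once OSXSAVE has been confirmed. */
	__asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (eax), "=d" (edx) : "c" (0));
	return ((unsigned long long)edx << 32) | eax;
}

int main(void)
{
	unsigned a, b, c, d;

	/* OSXSAVE: CPUID leaf 1, ECX bit 27 */
	if (!__get_cpuid(1, &a, &b, &c, &d) || !(c & (1U << 27)))
		return 1;
	/* AVX-VNNI: CPUID leaf 7, subleaf 1, EAX bit 4 */
	if (!__get_cpuid_count(7, 1, &a, &b, &c, &d))
		return 1;
	if ((a & (1U << 4)) && ((read_xcr0() & 0x6) == 0x6))
		printf("AVX-VNNI is supported and usable\n");
	return 0;
}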
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index 10afd8dd..7bfa098c 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -56,6 +56,7 @@
 #define X86_CPU_FEATURE_AVX512VL	0x00000100
 #define X86_CPU_FEATURE_VPCLMULQDQ	0x00000200
 #define X86_CPU_FEATURE_AVX512VNNI	0x00000400
+#define X86_CPU_FEATURE_AVXVNNI		0x00000800
 
 #define HAVE_SSE2(features)	(HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
 #define HAVE_PCLMULQDQ(features)	(HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
@@ -67,6 +68,7 @@
 #define HAVE_AVX512VL(features)	(HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
 #define HAVE_VPCLMULQDQ(features)	(HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
 #define HAVE_AVX512VNNI(features)	(HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))
+#define HAVE_AVXVNNI(features)	(HAVE_AVXVNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVXVNNI))
 
 #if HAVE_DYNAMIC_X86_CPU_FEATURES
 #define X86_CPU_FEATURES_KNOWN	0x80000000
@@ -173,7 +175,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_BMI2_INTRIN	0
 #endif
 
-/* AVX-512F */
+/* AVX512F */
 #ifdef __AVX512F__
 # define HAVE_AVX512F_NATIVE	1
 #else
@@ -186,7 +188,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_AVX512F_INTRIN	0
 #endif
 
-/* AVX-512BW */
+/* AVX512BW */
 #ifdef __AVX512BW__
 # define HAVE_AVX512BW_NATIVE	1
 #else
@@ -199,7 +201,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_AVX512BW_INTRIN	0
 #endif
 
-/* AVX-512VL */
+/* AVX512VL */
 #ifdef __AVX512VL__
 # define HAVE_AVX512VL_NATIVE	1
 #else
@@ -238,6 +240,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_AVX512VNNI_INTRIN	0
 #endif
 
+/* AVX-VNNI */
+#ifdef __AVXVNNI__
+# define HAVE_AVXVNNI_NATIVE	1
+#else
+# define HAVE_AVXVNNI_NATIVE	0
+#endif
+#if HAVE_AVXVNNI_NATIVE || GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || \
+    defined(_MSC_VER)
+# define HAVE_AVXVNNI_INTRIN	1
+#else
+# define HAVE_AVXVNNI_INTRIN	0
+#endif
+
 #endif /* ARCH_X86_32 || ARCH_X86_64 */
 
 #endif /* LIB_X86_CPU_FEATURES_H */
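The HAVE_AVXVNNI_INTRIN gate above covers the case where the compiler understands the AVX-VNNI intrinsics even though the translation unit is not built with -mavxvnni; a per-function target attribute (roughly what _target_attribute expands to on GCC and clang) then enables the encoding locally. Below is a minimal sketch of that pattern (hypothetical example, GCC 11.1+/clang 12+, not patch code), which also illustrates the "vpdpbusd with all-ones" byte-summing alternative mentioned in the adler32 comment:

#include <immintrin.h>

/* Sum 32 bytes by dotting them with an all-ones vector: one vpdpbusd. */
__attribute__((target("avx2,avxvnni")))
static int sum32_vnni(const unsigned char *p)
{
	const __m256i ones = _mm256_set1_epi8(1);
	__m256i bytes = _mm256_loadu_si256((const __m256i *)p);
	__m256i acc = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(),
					      bytes, ones);
	/* Horizontal sum of the eight 32-bit lanes. */
	__m128i s = _mm_add_epi32(_mm256_castsi256_si128(acc),
				  _mm256_extracti128_si256(acc, 1));
	s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4e));
	s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xb1));
	return _mm_cvtsi128_si32(s);
}

int main(void)
{
	unsigned char buf[32];

	for (int i = 0; i < 32; i++)
		buf[i] = (unsigned char)i;
	/* Real code must verify AVX-VNNI support at runtime before calling
	 * such a function, exactly as arch_select_adler32_func() does. */
	return sum32_vnni(buf) == (31 * 32) / 2 ? 0 : 1;
}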
diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh
index 6b418190..972ad357 100755
--- a/scripts/checksum_benchmarks.sh
+++ b/scripts/checksum_benchmarks.sh
@@ -157,11 +157,15 @@ echo
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_features avx512_vnni avx512bw; then
-		do_benchmark "AVX512VNNI/AVX512BW"
-		disable_cpu_feature "avx512vnni" "-mno-avx512vnni"
+	if have_cpu_features avx512bw avx512_vnni; then
+		do_benchmark "AVX512BW/AVX512VNNI"
+		disable_cpu_feature "avx512_vnni" "-mno-avx512vnni"
 		disable_cpu_feature "avx512bw" "-mno-avx512bw"
 	fi
+	if have_cpu_features avx2 avx_vnni; then
+		do_benchmark "AVX2/AVX-VNNI"
+		disable_cpu_feature "avx_vnni" "-mno-avxvnni"
+	fi
 	if have_cpu_feature avx2; then
 		do_benchmark "AVX2"
 		disable_cpu_feature "avx2" "-mno-avx2"
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 9bca9677..51a7f4b0 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,8 +142,9 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw
-				   avx512f avx2 avx bmi2 pclmulqdq sse2)
+			features+=(zmm avx_vnni avx512_vnni vpclmulqdq
+				   avx512vl avx512bw avx512f
+				   avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)
 			features+=(dotprod sha3 crc32 pmull neon)
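The scripts above exercise the new code path only indirectly, through the benchmark and test programs; from a caller's point of view the dispatch stays invisible. For completeness, a minimal usage sketch against libdeflate's public API (the Adler-32 checksum starts at 1 per the format's definition):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <libdeflate.h>

int main(void)
{
	const char *msg = "hello, adler32";
	/* Internally picks the best implementation (AVX-VNNI, AVX-512, ...)
	 * for the current CPU at runtime. */
	uint32_t v = libdeflate_adler32(1, msg, strlen(msg));

	printf("adler32 = 0x%08x\n", v);
	return 0;
}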