From 5d15bcedd009231bdcd5181b59f538ced73aef1f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2024 16:55:44 -0700 Subject: [PATCH] lib/x86/crc32: more optimizations - As was recently done in the Adler-32 code, take advantage of the fact that on recent x86 processors, vmovdqu with an aligned pointer is just as fast as vmovdqa. Don't waste time aligning the pointer unless the length is very large, and at the same time, handle all cases of len >= 8*VL using the main loop so that the 4*VL wide loop isn't needed. (Before, aligning the pointer was tied to whether the main loop was used or not, since the main loop used vmovdqa.) - Handle short lengths more efficiently. Instead of falling back to crc32_slice1() for all len < VL, use AVX-512 masking (when available) to handle 4 <= len <= 15, and use 128-bit vector instructions to handle 16 <= len < VL. - Document why the main loop uses a width of 8*VL instead of 4*VL. --- lib/x86/cpu_features.c | 3 - lib/x86/cpu_features.h | 23 ++-- lib/x86/crc32_impl.h | 21 ++- lib/x86/crc32_pclmul_template.h | 234 +++++++++++++++++--------------- scripts/checksum_benchmarks.sh | 4 +- 5 files changed, 148 insertions(+), 137 deletions(-) diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index 56601764..fa04d646 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_AVX2, "avx2"}, {X86_CPU_FEATURE_BMI2, "bmi2"}, {X86_CPU_FEATURE_ZMM, "zmm"}, - {X86_CPU_FEATURE_AVX512F, "avx512f"}, {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, @@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void) if (((xcr0 & 0xe6) == 0xe6) && allow_512bit_vectors(manufacturer, family, model)) features |= X86_CPU_FEATURE_ZMM; - if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6)) - features |= X86_CPU_FEATURE_AVX512F; if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6)) features |= X86_CPU_FEATURE_AVX512BW; if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6)) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index d5d3f2ac..8dda21fd 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -39,17 +39,16 @@ #define X86_CPU_FEATURE_BMI2 (1 << 4) /* * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On - * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU - * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use - * AVX-512 instructions, but only with ymm and xmm registers. + * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and + * operating system support AVX-512. On these CPUs, we may still use AVX-512 + * instructions, but only with xmm and ymm registers. */ #define X86_CPU_FEATURE_ZMM (1 << 5) -#define X86_CPU_FEATURE_AVX512F (1 << 6) -#define X86_CPU_FEATURE_AVX512BW (1 << 7) -#define X86_CPU_FEATURE_AVX512VL (1 << 8) -#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9) -#define X86_CPU_FEATURE_AVX512VNNI (1 << 10) -#define X86_CPU_FEATURE_AVXVNNI (1 << 11) +#define X86_CPU_FEATURE_AVX512BW (1 << 6) +#define X86_CPU_FEATURE_AVX512VL (1 << 7) +#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 8) +#define X86_CPU_FEATURE_AVX512VNNI (1 << 9) +#define X86_CPU_FEATURE_AVXVNNI (1 << 10) #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) /* Runtime x86 CPU feature detection is supported. 
*/ @@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_BMI2_NATIVE 0 #endif -#ifdef __AVX512F__ -# define HAVE_AVX512F(features) 1 -#else -# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) -#endif - #ifdef __AVX512BW__ # define HAVE_AVX512BW(features) 1 #else diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 8b23b904..462e6d88 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -30,6 +30,19 @@ #include "cpu_features.h" +/* + * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. + * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. + */ +static const u8 MAYBE_UNUSED shift_tab[48] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) /* PCLMULQDQ implementation */ # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq @@ -88,7 +101,7 @@ */ # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") # define VL 32 # define USE_SSE4_1 1 # define USE_AVX512 1 @@ -101,7 +114,7 @@ */ # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") # define VL 64 # define USE_SSE4_1 1 # define USE_AVX512 1 @@ -116,12 +129,12 @@ arch_select_crc32_func(void) #ifdef crc32_x86_vpclmulqdq_avx512_vl512 if ((features & X86_CPU_FEATURE_ZMM) && HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && - HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) return crc32_x86_vpclmulqdq_avx512_vl512; #endif #ifdef crc32_x86_vpclmulqdq_avx512_vl256 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && - HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) return crc32_x86_vpclmulqdq_avx512_vl256; #endif #ifdef crc32_x86_vpclmulqdq_avx2 diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index bb892d82..0fe38f8a 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -37,8 +37,8 @@ * VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul * VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1 * VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2 - * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl - * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl + * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl + * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl * (Other combinations are not useful and have not been tested.) * VL: * Vector length in bytes. Must be 16, 32, or 64. 
@@ -162,18 +162,6 @@ static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, __m128i /* __v2du */ mults_128b) { - /* - * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. - * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. - */ - static const u8 shift_tab[48] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]); __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]); __m128i x0, x1; @@ -216,70 +204,77 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) const __m128i barrett_reduction_constants = _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1); vec_t v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x0, x1; - - /* - * There are two overall code paths. The first path supports all - * lengths, but is intended for short lengths; it uses unaligned loads - * and does at most 4-way folds. The second path only supports longer - * lengths, aligns the pointer in order to do aligned loads, and does up - * to 8-way folds. The length check below decides which path to take. - */ - if (len < 64*VL) { - if (len < VL) - return crc32_slice1(crc, p, len); - - v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc))); - p += VL; + __m128i x0 = _mm_cvtsi32_si128(crc); + __m128i x1; - if (len >= 4*VL) { - v1 = VLOADU(p + 0*VL); - v2 = VLOADU(p + 1*VL); - v3 = VLOADU(p + 2*VL); - p += 3*VL; - while (len >= 8*VL) { - v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v); - v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v); - v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v); - v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v); - p += 4*VL; - len -= 4*VL; + if (len < 8*VL) { + if (len < VL) { + STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64); + if (len < 16) { + #if USE_AVX512 + if (len < 4) + return crc32_slice1(crc, p, len); + /* + * Handle 4 <= len <= 15 bytes by doing a masked + * load, XOR'ing the current CRC with the first + * 4 bytes, left-shifting by '16 - len' bytes to + * align the result to the end of x0 (so that it + * becomes the low-order coefficients of a + * 128-bit polynomial), and then doing the usual + * reduction from 128 bits to 32 bits. + */ + x0 = _mm_xor_si128( + x0, _mm_maskz_loadu_epi8((1 << len) - 1, p)); + x0 = _mm_shuffle_epi8( + x0, _mm_loadu_si128((const void *)&shift_tab[len])); + goto reduce_x0; + #else + return crc32_slice1(crc, p, len); + #endif } - v0 = fold_vec(v0, v2, mults_2v); - v1 = fold_vec(v1, v3, mults_2v); - if (len & (2*VL)) { - v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v); - v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v); - p += 2*VL; - } - v0 = fold_vec(v0, v1, mults_1v); - if (len & VL) { - v0 = fold_vec(v0, VLOADU(p), mults_1v); - p += VL; - } - } else { - if (len >= 2*VL) { - v0 = fold_vec(v0, VLOADU(p), mults_1v); - p += VL; - if (len >= 3*VL) { - v0 = fold_vec(v0, VLOADU(p), mults_1v); - p += VL; - } + /* + * Handle 16 <= len < VL bytes where VL is 32 or 64. + * Use 128-bit instructions so that these lengths aren't + * slower with VL > 16 than with VL=16. 
+ */ + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0); + if (len >= 32) { + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 16)), + mults_128b); + if (len >= 48) + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 32)), + mults_128b); } + p += len & ~15; + goto less_than_16_remaining; } + v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0)); + if (len < 2*VL) { + p += VL; + goto less_than_vl_remaining; + } + v1 = VLOADU(p + 1*VL); + if (len < 4*VL) { + p += 2*VL; + goto less_than_2vl_remaining; + } + v2 = VLOADU(p + 2*VL); + v3 = VLOADU(p + 3*VL); + p += 4*VL; } else { - size_t align = -(uintptr_t)p & (VL-1); - const vec_t *vp; + /* + * If the length is large and the pointer is misaligned, align + * it. For smaller lengths, just take the misaligned load + * penalty. Note that on recent x86 CPUs, vmovdqu with an + * aligned address is just as fast as vmovdqa, so there's no + * need to use vmovdqa in the main loop. + */ + if (len > 65536 && ((uintptr_t)p & (VL-1))) { + size_t align = -(uintptr_t)p & (VL-1); - /* Align p to the next VL-byte boundary. */ - if (align == 0) { - vp = (const vec_t *)p; - v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); - } else { len -= align; #if USE_SSE4_1 - x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), - _mm_cvtsi32_si128(crc)); + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0); p += 16; if (align & 15) { x0 = fold_lessthan16bytes(x0, p, align & 15, @@ -287,7 +282,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) p += align & 15; align &= ~15; } - while (align >= 16) { + while (align) { x0 = fold_vec128(x0, *(const __m128i *)p, mults_128b); p += 16; @@ -296,66 +291,75 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v0 = M128I_TO_VEC(x0); # if VL == 32 v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1); - p += 16; # elif VL == 64 v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1); - p += 16; - v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1); - p += 32; + v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1); # endif - vp = (const vec_t *)p; + p -= 16; #else crc = crc32_slice1(crc, p, align); p += align; - vp = (const vec_t *)p; - v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc))); #endif + } else { + v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0)); } - v1 = *vp++; - v2 = *vp++; - v3 = *vp++; - v4 = *vp++; - v5 = *vp++; - v6 = *vp++; - v7 = *vp++; - do { - v0 = fold_vec(v0, *vp++, mults_8v); - v1 = fold_vec(v1, *vp++, mults_8v); - v2 = fold_vec(v2, *vp++, mults_8v); - v3 = fold_vec(v3, *vp++, mults_8v); - v4 = fold_vec(v4, *vp++, mults_8v); - v5 = fold_vec(v5, *vp++, mults_8v); - v6 = fold_vec(v6, *vp++, mults_8v); - v7 = fold_vec(v7, *vp++, mults_8v); - len -= 8*VL; - } while (len >= 16*VL); + v1 = VLOADU(p + 1*VL); + v2 = VLOADU(p + 2*VL); + v3 = VLOADU(p + 3*VL); + v4 = VLOADU(p + 4*VL); + v5 = VLOADU(p + 5*VL); + v6 = VLOADU(p + 6*VL); + v7 = VLOADU(p + 7*VL); + p += 8*VL; /* - * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) - * and fold in any VL-byte data segments that remain. + * This is the main loop, processing 8*VL bytes per iteration. + * 4*VL is usually enough and would result in smaller code, but + * Skylake and Cascade Lake need 8*VL to get full performance. 
*/ + while (len >= 16*VL) { + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v); + v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v); + v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v); + v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v); + v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v); + v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v); + v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v); + p += 8*VL; + len -= 8*VL; + } + + /* Fewer than 8*VL bytes remain. */ v0 = fold_vec(v0, v4, mults_4v); v1 = fold_vec(v1, v5, mults_4v); v2 = fold_vec(v2, v6, mults_4v); v3 = fold_vec(v3, v7, mults_4v); if (len & (4*VL)) { - v0 = fold_vec(v0, *vp++, mults_4v); - v1 = fold_vec(v1, *vp++, mults_4v); - v2 = fold_vec(v2, *vp++, mults_4v); - v3 = fold_vec(v3, *vp++, mults_4v); - } - v0 = fold_vec(v0, v2, mults_2v); - v1 = fold_vec(v1, v3, mults_2v); - if (len & (2*VL)) { - v0 = fold_vec(v0, *vp++, mults_2v); - v1 = fold_vec(v1, *vp++, mults_2v); + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v); + v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v); + v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v); + p += 4*VL; } - v0 = fold_vec(v0, v1, mults_1v); - if (len & VL) - v0 = fold_vec(v0, *vp++, mults_1v); - p = (const u8 *)vp; } - + /* Fewer than 4*VL bytes remain. */ + v0 = fold_vec(v0, v2, mults_2v); + v1 = fold_vec(v1, v3, mults_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v); + p += 2*VL; + } +less_than_2vl_remaining: + /* Fewer than 2*VL bytes remain. */ + v0 = fold_vec(v0, v1, mults_1v); + if (len & VL) { + v0 = fold_vec(v0, VLOADU(p), mults_1v); + p += VL; + } +less_than_vl_remaining: /* * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0 * (length 16 bytes) and fold in any 16-byte data segments that remain. @@ -388,6 +392,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) p += 16; } #endif +less_than_16_remaining: len &= 15; /* @@ -397,6 +402,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) #if USE_SSE4_1 if (len) x0 = fold_lessthan16bytes(x0, p, len, mults_128b); +#endif +#if USE_AVX512 +reduce_x0: #endif /* diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 12fb405f..94f281aa 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -153,12 +153,12 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES="" { case $ARCH in i386|x86_64) - if have_cpu_features vpclmulqdq pclmulqdq avx512f avx512vl; then + if have_cpu_features vpclmulqdq pclmulqdq avx512bw avx512vl; then do_benchmark "VPCLMULQDQ/AVX512/VL512" disable_cpu_feature zmm do_benchmark "VPCLMULQDQ/AVX512/VL256" disable_cpu_feature avx512vl "-mno-avx512vl" - disable_cpu_feature avx512f "-mno-avx512f" + disable_cpu_feature avx512bw "-mno-avx512bw" fi if have_cpu_features vpclmulqdq pclmulqdq avx2; then do_benchmark "VPCLMULQDQ/AVX2"