diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index 99a5f3f9..e411fd3b 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -43,7 +43,7 @@ # endif # endif # include -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 adler32_arm_neon(u32 adler, const u8 *p, size_t len) { static const u16 _aligned_attribute(16) mults[64] = { @@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) # endif # endif # include -static u32 ATTRIBUTES +static ATTRIBUTES u32 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) { static const u8 _aligned_attribute(16) mults[64] = { diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index d6ea30c0..d52954a6 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) } #define crc32_arm_crc crc32_arm_crc -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_crc(u32 crc, const u8 *p, size_t len) { if (len >= 64) { @@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) } #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) { const size_t align = -(uintptr_t)p & 7; @@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) # define ENABLE_EOR3 0 # include "crc32_pmull_helpers.h" -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) { static const u64 _aligned_attribute(16) mults[3][2] = { diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h index c2f8af06..5a4bd0ca 100644 --- a/lib/arm/crc32_pmull_wide.h +++ b/lib/arm/crc32_pmull_wide.h @@ -52,7 +52,7 @@ #include "crc32_pmull_helpers.h" -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) { uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; diff --git a/lib/decompress_template.h b/lib/decompress_template.h index 3c1da677..8c874c36 100644 --- a/lib/decompress_template.h +++ b/lib/decompress_template.h @@ -41,7 +41,7 @@ # define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) #endif -static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result FUNCNAME(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index a32af413..ba559e6e 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -33,19 +33,19 @@ /* SSE2 and AVX2 implementations. Used on older CPUs. 
*/ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define adler32_x86_sse2 adler32_x86_sse2 -# define SUFFIX _x86_sse2 +# define SUFFIX _sse2 # define ATTRIBUTES _target_attribute("sse2") # define VL 16 # define USE_VNNI 0 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" # define adler32_x86_avx2 adler32_x86_avx2 -# define SUFFIX _x86_avx2 +# define SUFFIX _avx2 # define ATTRIBUTES _target_attribute("avx2") # define VL 32 # define USE_VNNI 0 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" #endif @@ -55,11 +55,11 @@ */ #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni -# define SUFFIX _x86_avx2_vnni +# define SUFFIX _avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") # define VL 32 # define USE_VNNI 1 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" #endif @@ -72,11 +72,11 @@ * that support AVX10/256 but not AVX10/512. */ # define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni -# define SUFFIX _x86_avx512_vl256_vnni +# define SUFFIX _avx512_vl256_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") # define VL 32 # define USE_VNNI 1 -# define USE_MASKING 1 +# define USE_AVX512 1 # include "adler32_template.h" /* @@ -85,11 +85,11 @@ * the optimal implementation on CPUs that support AVX10/512. */ # define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni -# define SUFFIX _x86_avx512_vl512_vnni +# define SUFFIX _avx512_vl512_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") # define VL 64 # define USE_VNNI 1 -# define USE_MASKING 1 +# define USE_AVX512 1 # include "adler32_template.h" #endif diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h index c788acc5..1593ee5a 100644 --- a/lib/x86/adler32_template.h +++ b/lib/x86/adler32_template.h @@ -34,20 +34,21 @@ * ATTRIBUTES: * Target function attributes to use. Must satisfy the dependencies of the * other parameters as follows: - * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2 - * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2 - * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni - * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni - * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni + * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2 + * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2 + * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni + * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni + * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni * (Other combinations are not useful and have not been tested.) * VL: - * Vector length in bytes. Must be 16, 32, and 64. + * Vector length in bytes. Must be 16, 32, or 64. * USE_VNNI: * If 1, use the VNNI dot product based algorithm. * If 0, use the legacy SSE2 and AVX2 compatible algorithm. - * USE_MASKING: - * If 1, use AVX-512 features such as masking. - * If 0, assume that the CPU might not support AVX-512. + * USE_AVX512: + * If 1, take advantage of AVX-512 features such as masking. This doesn't + * enable the use of 512-bit vectors; the vector length is controlled by + * VL. If 0, assume that the CPU might not support AVX-512. 
*/ #if VL == 16 @@ -57,7 +58,7 @@ # define VADD8(a, b) _mm_add_epi8((a), (b)) # define VADD16(a, b) _mm_add_epi16((a), (b)) # define VADD32(a, b) _mm_add_epi32((a), (b)) -# if USE_MASKING +# if USE_AVX512 # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) # else # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) @@ -68,12 +69,12 @@ # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) # define VSAD8(a, b) _mm_sad_epu8((a), (b)) -# define VSET1_32(a) _mm_set1_epi32(a) # define VSET1_8(a) _mm_set1_epi8(a) +# define VSET1_32(a) _mm_set1_epi32(a) # define VSETZERO() _mm_setzero_si128() # define VSLL32(a, b) _mm_slli_epi32((a), (b)) -# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) #elif VL == 32 # define vec_t __m256i # define mask_t u32 @@ -81,7 +82,7 @@ # define VADD8(a, b) _mm256_add_epi8((a), (b)) # define VADD16(a, b) _mm256_add_epi16((a), (b)) # define VADD32(a, b) _mm256_add_epi32((a), (b)) -# if USE_MASKING +# if USE_AVX512 # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) # else # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) @@ -92,27 +93,32 @@ # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) # define VSAD8(a, b) _mm256_sad_epu8((a), (b)) -# define VSET1_32(a) _mm256_set1_epi32(a) # define VSET1_8(a) _mm256_set1_epi8(a) +# define VSET1_32(a) _mm256_set1_epi32(a) # define VSETZERO() _mm256_setzero_si256() # define VSLL32(a, b) _mm256_slli_epi32((a), (b)) -# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) #elif VL == 64 # define vec_t __m512i # define mask_t u64 # define LOG2_VL 6 # define VADD8(a, b) _mm512_add_epi8((a), (b)) +# define VADD16(a, b) _mm512_add_epi16((a), (b)) # define VADD32(a, b) _mm512_add_epi32((a), (b)) # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) # define VLOAD(p) _mm512_load_si512((const void *)(p)) # define VLOADU(p) _mm512_loadu_si512((const void *)(p)) +# define VMADD16(a, b) _mm512_madd_epi16((a), (b)) # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) -# define VSET1_32(a) _mm512_set1_epi32(a) +# define VSAD8(a, b) _mm512_sad_epu8((a), (b)) # define VSET1_8(a) _mm512_set1_epi8(a) +# define VSET1_32(a) _mm512_set1_epi32(a) # define VSETZERO() _mm512_setzero_si512() # define VSLL32(a, b) _mm512_slli_epi32((a), (b)) +# define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b)) #else # error "unsupported vector length" #endif @@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) } #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) -static u32 ATTRIBUTES -ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +static ATTRIBUTES u32 +ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len) { #if USE_VNNI /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ @@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) #if USE_VNNI /* - * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or * AVX-VNNI. 
vpdpbusd multiplies the unsigned bytes of one vector by * the signed bytes of another vector and adds the sums in groups of 4 * to the 32-bit elements of a third vector. We use it in two ways: @@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * Process the last 0 < n <= VL bytes of the chunk. * Utilize a masked load if it's available. */ - #if USE_MASKING + #if USE_AVX512 data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p); #else data = zeroes; @@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are * used with pmaddwd which does signed multiplication. In the - * SSE2 case this limits chunks to 4096 bytes instead of 5504. + * SSE2 case this limits chunks to 4096 bytes instead of 5536. */ size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), MAX_CHUNK_LEN) & ~(2*VL - 1)); @@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) #undef VSET1_32 #undef VSETZERO #undef VSLL32 -#undef VUNPACKHI8 #undef VUNPACKLO8 +#undef VUNPACKHI8 #undef SUFFIX #undef ATTRIBUTES #undef VL #undef USE_VNNI -#undef USE_MASKING +#undef USE_AVX512 diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 3d8e254d..8b23b904 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -36,8 +36,8 @@ # define SUFFIX _pclmulqdq # define ATTRIBUTES _target_attribute("pclmul") # define VL 16 -# define FOLD_LESSTHAN16BYTES 0 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 0 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" /* @@ -49,55 +49,62 @@ * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient * handling of partial blocks. (We *could* compile a variant with - * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.) */ # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx # define ATTRIBUTES _target_attribute("pclmul,avx") # define VL 16 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 1 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif /* - * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * VPCLMULQDQ/AVX2 implementation. This is used on CPUs that have AVX2 and + * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake. * * Currently this can't be enabled with MSVC because MSVC has a bug where it * incorrectly assumes that VPCLMULQDQ implies AVX-512: - * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785 */ #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") # define VL 32 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 1 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) /* - * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage - * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. 
- * This can be useful on CPUs where 512-bit vectors cause downclocking. + * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar + * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog + * instruction and more registers. This is used on CPUs that support AVX-512 + * but where using 512-bit vectors causes downclocking. This should also be the + * optimal implementation on CPUs that support AVX10/256 but not AVX10/512. */ # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 32 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 1 +# define USE_SSE4_1 1 +# define USE_AVX512 1 # include "crc32_pclmul_template.h" -/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ +/* + * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs + * that have a good AVX-512 implementation including VPCLMULQDQ. This should + * also be the optimal implementation on CPUs that support AVX10/512. + */ # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 64 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 1 +# define USE_SSE4_1 1 +# define USE_AVX512 1 # include "crc32_pclmul_template.h" #endif diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 4257d449..bb892d82 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -34,18 +34,22 @@ * ATTRIBUTES: * Target function attributes to use. Must satisfy the dependencies of the * other parameters as follows: - * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul - * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 - * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 - * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl - * VL=64: at least vpclmulqdq,pclmul,avx512vl + * VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul + * VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1 + * VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl + * (Other combinations are not useful and have not been tested.) * VL: - * Vector length in bytes. Supported values are 16, 32, and 64. - * FOLD_LESSTHAN16BYTES: - * Use vector instructions to handle any partial blocks at the beginning - * and end, instead of falling back to scalar instructions for those parts. - * USE_TERNARYLOGIC: - * Use the vpternlog instruction to do three-argument XORs. + * Vector length in bytes. Must be 16, 32, or 64. + * USE_SSE4_1: + * If 1, take advantage of SSE4.1 instructions such as pblendvb. + * If 0, assume that the CPU might not support SSE4.1. + * USE_AVX512: + * If 1, take advantage of AVX-512 features such as masking and the + * vpternlog instruction. This doesn't enable the use of 512-bit vectors; + * the vector length is controlled by VL. If 0, assume that the CPU might + * not support AVX-512. * * The overall algorithm used is CRC folding with carryless multiplication * instructions. Note that the x86 crc32 instruction cannot be used, as it is @@ -62,55 +66,10 @@ * or AVX512VL, or four in combination with AVX512F. 
*/ -#undef fold_vec128 -static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) -{ - dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); - dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); - return dst; -} -#define fold_vec128 ADD_SUFFIX(fold_vec128) - -#if VL >= 32 -#undef fold_vec256 -static forceinline ATTRIBUTES __m256i -ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) -{ -#if USE_TERNARYLOGIC - return _mm256_ternarylogic_epi32( - _mm256_clmulepi64_epi128(src, multipliers, 0x00), - _mm256_clmulepi64_epi128(src, multipliers, 0x11), - dst, - 0x96); -#else - return _mm256_xor_si256( - _mm256_xor_si256(dst, - _mm256_clmulepi64_epi128(src, multipliers, 0x00)), - _mm256_clmulepi64_epi128(src, multipliers, 0x11)); -#endif -} -#define fold_vec256 ADD_SUFFIX(fold_vec256) -#endif /* VL >= 32 */ - -#if VL >= 64 -#undef fold_vec512 -static forceinline ATTRIBUTES __m512i -ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) -{ - return _mm512_ternarylogic_epi32( - _mm512_clmulepi64_epi128(src, multipliers, 0x00), - _mm512_clmulepi64_epi128(src, multipliers, 0x11), - dst, - 0x96); -} -#define fold_vec512 ADD_SUFFIX(fold_vec512) -#endif /* VL >= 64 */ - #if VL == 16 # define vec_t __m128i # define fold_vec fold_vec128 -# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VLOADU(p) _mm_loadu_si128((const void *)(p)) # define VXOR(a, b) _mm_xor_si128((a), (b)) # define M128I_TO_VEC(a) a # define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) @@ -120,7 +79,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #elif VL == 32 # define vec_t __m256i # define fold_vec fold_vec256 -# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VLOADU(p) _mm256_loadu_si256((const void *)(p)) # define VXOR(a, b) _mm256_xor_si256((a), (b)) # define M128I_TO_VEC(a) _mm256_castsi128_si256(a) # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) @@ -131,7 +90,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #elif VL == 64 # define vec_t __m512i # define fold_vec fold_vec512 -# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VLOADU(p) _mm512_loadu_si512((const void *)(p)) # define VXOR(a, b) _mm512_xor_si512((a), (b)) # define M128I_TO_VEC(a) _mm512_castsi128_si512(a) # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) @@ -143,7 +102,54 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) # error "unsupported vector length" #endif -#if FOLD_LESSTHAN16BYTES +#undef fold_vec128 +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults) +{ + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11)); + return dst; +} +#define fold_vec128 ADD_SUFFIX(fold_vec128) + +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults) +{ +#if USE_AVX512 + /* vpternlog with immediate 0x96 is a three-argument XOR. 
*/ + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, mults, 0x00), + _mm256_clmulepi64_epi128(src, mults, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, mults, 0x00)), + _mm256_clmulepi64_epi128(src, mults, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults) +{ + /* vpternlog with immediate 0x96 is a three-argument XOR. */ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, mults, 0x00), + _mm512_clmulepi64_epi128(src, mults, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if USE_SSE4_1 /* * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and @@ -154,7 +160,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #undef fold_lessthan16bytes static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, - __m128i /* __v2du */ multipliers_128b) + __m128i /* __v2du */ mults_128b) { /* * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. @@ -184,26 +190,31 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, /* msb 0/1 of each byte selects byte from arg1/2 */ rshift); - return fold_vec128(x0, x1, multipliers_128b); + return fold_vec128(x0, x1, mults_128b); } #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) -#endif /* FOLD_LESSTHAN16BYTES */ +#endif /* USE_SSE4_1 */ -static u32 ATTRIBUTES +static ATTRIBUTES u32 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ - const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ - const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ - const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ - const __m128i /* __v2du */ multipliers_128b = - _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); - const __m128i /* __v2du */ final_multiplier = - _mm_set_epi64x(0, CRC32_X63_MODG); + /* + * mults_{N}v are the vectors of multipliers for folding across N vec_t + * vectors, i.e. N*VL*8 bits. mults_128b are the two multipliers for + * folding across 128 bits. mults_128b differs from mults_1v when + * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs, + * but since this is for CRC-32 only their low 32 bits are nonzero. + * For more details, see scripts/gen_crc32_multipliers.c. 
+ */ + const vec_t mults_8v = MULTS_8V; + const vec_t mults_4v = MULTS_4V; + const vec_t mults_2v = MULTS_2V; + const vec_t mults_1v = MULTS_1V; + const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); + const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG); const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); - const __m128i /* __v2du */ barrett_reduction_constants = - _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, - CRC32_BARRETT_CONSTANT_1); + const __m128i barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1); vec_t v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1; @@ -218,50 +229,40 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) if (len < VL) return crc32_slice1(crc, p, len); - v0 = VXOR(VLOAD_UNALIGNED(p), - M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc))); p += VL; if (len >= 4*VL) { - v1 = VLOAD_UNALIGNED(p + 0*VL); - v2 = VLOAD_UNALIGNED(p + 1*VL); - v3 = VLOAD_UNALIGNED(p + 2*VL); + v1 = VLOADU(p + 0*VL); + v2 = VLOADU(p + 1*VL); + v3 = VLOADU(p + 2*VL); p += 3*VL; while (len >= 8*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), - multipliers_4v); - v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), - multipliers_4v); - v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), - multipliers_4v); - v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), - multipliers_4v); + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v); + v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v); + v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v); p += 4*VL; len -= 4*VL; } - v0 = fold_vec(v0, v2, multipliers_2v); - v1 = fold_vec(v1, v3, multipliers_2v); + v0 = fold_vec(v0, v2, mults_2v); + v1 = fold_vec(v1, v3, mults_2v); if (len & (2*VL)) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), - multipliers_2v); - v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), - multipliers_2v); + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v); p += 2*VL; } - v0 = fold_vec(v0, v1, multipliers_1v); + v0 = fold_vec(v0, v1, mults_1v); if (len & VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; } } else { if (len >= 2*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; if (len >= 3*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; } } @@ -276,19 +277,19 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); } else { len -= align; - #if FOLD_LESSTHAN16BYTES + #if USE_SSE4_1 x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), _mm_cvtsi32_si128(crc)); p += 16; if (align & 15) { x0 = fold_lessthan16bytes(x0, p, align & 15, - multipliers_128b); + mults_128b); p += align & 15; align &= ~15; } while (align >= 16) { x0 = fold_vec128(x0, *(const __m128i *)p, - multipliers_128b); + mults_128b); p += 16; align -= 16; } @@ -318,14 +319,14 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v6 = *vp++; v7 = *vp++; do { - v0 = fold_vec(v0, *vp++, multipliers_8v); - v1 = fold_vec(v1, *vp++, multipliers_8v); - v2 = fold_vec(v2, *vp++, multipliers_8v); - v3 = fold_vec(v3, *vp++, multipliers_8v); - v4 = fold_vec(v4, *vp++, multipliers_8v); - v5 = fold_vec(v5, *vp++, multipliers_8v); - v6 = fold_vec(v6, *vp++, multipliers_8v); - v7 = fold_vec(v7, *vp++, multipliers_8v); + v0 = fold_vec(v0, *vp++, 
mults_8v); + v1 = fold_vec(v1, *vp++, mults_8v); + v2 = fold_vec(v2, *vp++, mults_8v); + v3 = fold_vec(v3, *vp++, mults_8v); + v4 = fold_vec(v4, *vp++, mults_8v); + v5 = fold_vec(v5, *vp++, mults_8v); + v6 = fold_vec(v6, *vp++, mults_8v); + v7 = fold_vec(v7, *vp++, mults_8v); len -= 8*VL; } while (len >= 16*VL); @@ -333,58 +334,57 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) * and fold in any VL-byte data segments that remain. */ - v0 = fold_vec(v0, v4, multipliers_4v); - v1 = fold_vec(v1, v5, multipliers_4v); - v2 = fold_vec(v2, v6, multipliers_4v); - v3 = fold_vec(v3, v7, multipliers_4v); + v0 = fold_vec(v0, v4, mults_4v); + v1 = fold_vec(v1, v5, mults_4v); + v2 = fold_vec(v2, v6, mults_4v); + v3 = fold_vec(v3, v7, mults_4v); if (len & (4*VL)) { - v0 = fold_vec(v0, *vp++, multipliers_4v); - v1 = fold_vec(v1, *vp++, multipliers_4v); - v2 = fold_vec(v2, *vp++, multipliers_4v); - v3 = fold_vec(v3, *vp++, multipliers_4v); + v0 = fold_vec(v0, *vp++, mults_4v); + v1 = fold_vec(v1, *vp++, mults_4v); + v2 = fold_vec(v2, *vp++, mults_4v); + v3 = fold_vec(v3, *vp++, mults_4v); } - v0 = fold_vec(v0, v2, multipliers_2v); - v1 = fold_vec(v1, v3, multipliers_2v); + v0 = fold_vec(v0, v2, mults_2v); + v1 = fold_vec(v1, v3, mults_2v); if (len & (2*VL)) { - v0 = fold_vec(v0, *vp++, multipliers_2v); - v1 = fold_vec(v1, *vp++, multipliers_2v); + v0 = fold_vec(v0, *vp++, mults_2v); + v1 = fold_vec(v1, *vp++, mults_2v); } - v0 = fold_vec(v0, v1, multipliers_1v); + v0 = fold_vec(v0, v1, mults_1v); if (len & VL) - v0 = fold_vec(v0, *vp++, multipliers_1v); + v0 = fold_vec(v0, *vp++, mults_1v); p = (const u8 *)vp; } /* - * Reduce v0 (length VL bytes) to x0 (length 16 bytes) - * and fold in any 16-byte data segments that remain. + * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0 + * (length 16 bytes) and fold in any 16-byte data segments that remain. */ #if VL == 16 x0 = v0; #else { -# if VL == 32 + #if VL == 32 __m256i y0 = v0; -# else - const __m256i multipliers_256b = + #else + const __m256i mults_256b = _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, CRC32_X223_MODG, CRC32_X287_MODG); __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), _mm512_extracti64x4_epi64(v0, 1), - multipliers_256b); + mults_256b); if (len & 32) { y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), - multipliers_256b); + mults_256b); p += 32; } -# endif + #endif x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), - _mm256_extracti128_si256(y0, 1), - multipliers_128b); + _mm256_extracti128_si256(y0, 1), mults_128b); } if (len & 16) { x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), - multipliers_128b); + mults_128b); p += 16; } #endif @@ -394,9 +394,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * If fold_lessthan16bytes() is available, handle any remainder * of 1 to 15 bytes now, before reducing to 32 bits. */ -#if FOLD_LESSTHAN16BYTES +#if USE_SSE4_1 if (len) - x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); + x0 = fold_lessthan16bytes(x0, p, len, mults_128b); #endif /* @@ -405,12 +405,12 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), - _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); + _mm_clmulepi64_si128(x0, mults_128b, 0x10)); /* Fold 96 => 64 bits. 
*/ x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), - final_multiplier, 0x00)); + final_mult, 0x00)); /* * Reduce 64 => 32 bits using Barrett reduction. @@ -459,7 +459,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), barrett_reduction_constants, 0x10); x0 = _mm_xor_si128(x0, x1); -#if FOLD_LESSTHAN16BYTES +#if USE_SSE4_1 crc = _mm_extract_epi32(x0, 1); #else crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); @@ -471,7 +471,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) #undef vec_t #undef fold_vec -#undef VLOAD_UNALIGNED +#undef VLOADU #undef VXOR #undef M128I_TO_VEC #undef MULTS @@ -483,5 +483,5 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) #undef SUFFIX #undef ATTRIBUTES #undef VL -#undef FOLD_LESSTHAN16BYTES -#undef USE_TERNARYLOGIC +#undef USE_SSE4_1 +#undef USE_AVX512
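
The reorganized Adler-32 template relies on the vpdpbusd semantics quoted in the comment above ("multiplies the unsigned bytes of one vector by the signed bytes of another vector and adds the sums in groups of 4 to the 32-bit elements of a third vector"). The sketch below models one 32-bit lane of that operation in scalar C; dpbusd_lane is a hypothetical helper name used only for illustration, not code from libdeflate.

#include <stdint.h>
#include <stdio.h>

/*
 * Scalar model of one 32-bit lane of vpdpbusd: multiply four unsigned bytes
 * of 'a' by the corresponding four signed bytes of 'b' and accumulate the
 * sum of the four products into the 32-bit element 'acc'.
 */
static int32_t
dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
	for (int j = 0; j < 4; j++)
		acc += (int32_t)a[j] * (int32_t)b[j];
	return acc;
}

int
main(void)
{
	/* Data bytes (unsigned) and Adler-32 style multipliers [4, 3, 2, 1]. */
	const uint8_t data[4] = { 200, 10, 255, 7 };
	const int8_t mults[4] = { 4, 3, 2, 1 };

	/* 200*4 + 10*3 + 255*2 + 7*1 = 1347 */
	printf("%d\n", dpbusd_lane(0, data, mults));
	return 0;
}

When the signed operand is the constant vector [VL, VL-1, ..., 1] mentioned at the top of ADD_SUFFIX(adler32_x86), each lane produces a position-weighted byte sum of the kind the Adler-32 s2 accumulation needs.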
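
The USE_AVX512 path loads the final 0 < n <= VL bytes of a chunk with a zero-masking load, VMASKZ_LOADU((mask_t)-1 >> (VL - n), p), where (mask_t)-1 >> (VL - n) sets exactly the low n mask bits. Below is a scalar sketch of that behavior, assuming VL = 16 and a hypothetical helper name maskz_loadu_bytes; it is not libdeflate code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Scalar model of a zero-masking byte load: byte i of the result is p[i]
 * if bit i of the mask is set, and 0 otherwise.
 */
static void
maskz_loadu_bytes(uint8_t dst[16], uint16_t mask, const uint8_t *p)
{
	for (int i = 0; i < 16; i++)
		dst[i] = (mask >> i) & 1 ? p[i] : 0;
}

int
main(void)
{
	uint8_t buf[16], v[16];
	unsigned n = 5;		/* 0 < n <= 16 bytes remain */

	memset(buf, 'x', sizeof(buf));
	memcpy(buf, "abcde", n);

	/* (mask_t)-1 >> (VL - n) selects only the low n bytes. */
	maskz_loadu_bytes(v, (uint16_t)(0xFFFFu >> (16 - n)), buf);

	/* Prints "abcde (tail zeroed: 1)": the unselected bytes are zero. */
	printf("%.*s (tail zeroed: %d)\n", (int)n, (const char *)v,
	       v[n] == 0 && v[15] == 0);
	return 0;
}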
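
fold_vec256() and fold_vec512() now note that vpternlog with immediate 0x96 is a three-argument XOR. The immediate acts as an 8-entry truth table selected, per bit position, by the three input bits, and 0x96 (binary 10010110) is exactly the truth table of a ^ b ^ c. The scalar sketch below checks this identity for 32-bit words; ternlog32 is a hypothetical helper for illustration only, not libdeflate code.

#include <stdint.h>
#include <stdio.h>

/*
 * Scalar model of a ternary-logic operation: for every bit position, the
 * three input bits form a 3-bit index that selects one bit of the 8-bit
 * immediate, so the immediate acts as a truth table.
 */
static uint32_t
ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
	uint32_t r = 0;

	for (int i = 0; i < 32; i++) {
		unsigned idx = (((a >> i) & 1) << 2) |
			       (((b >> i) & 1) << 1) |
			       ((c >> i) & 1);
		r |= (uint32_t)((imm >> idx) & 1) << i;
	}
	return r;
}

int
main(void)
{
	const uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;

	/* Immediate 0x96 reproduces a ^ b ^ c, the XOR used when folding. */
	printf("%d\n", ternlog32(a, b, c, 0x96) == (a ^ b ^ c));
	return 0;
}

This is why the USE_AVX512 variants can combine the two carryless products and the destination with a single vpternlog, where the non-AVX512 path in fold_vec256() needs two vpxor instructions.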