diff --git a/CMakeLists.txt b/CMakeLists.txt index 917fefb0..f902c675 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ if(LIBDEFLATE_ZLIB_SUPPORT) lib/adler32_vec_template.h lib/arm/adler32_impl.h lib/x86/adler32_impl.h + lib/x86/adler32_template.h lib/zlib_constants.h ) if(LIBDEFLATE_COMPRESSION_SUPPORT) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index c8b5153b..f0c68ace 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -30,371 +30,67 @@ #include "cpu_features.h" -/* - * The following macros horizontally sum the s1 counters and add them to the - * real s1, and likewise for s2. They do this via a series of reductions, each - * of which halves the vector length, until just one counter remains. - * - * The s1 reductions don't depend on the s2 reductions and vice versa, so for - * efficiency they are interleaved. - * - * If used_sad=1, then the bytes were summed using one of the Sum of Absolute - * Differences instructions (psadbw or vpsadbw) before they were added to v_s1. - * In this case every other counter in v_s1 is 0, so we skip one of the s1 - * reductions when going from 128 => 32 bits. - */ - -#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \ - \ - /* 128 => 32 bits */ \ - if (!(used_sad)) \ - s1_last = _mm_add_epi32(s1_last, \ - _mm_shuffle_epi32(s1_last, 0x31)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \ - s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \ - \ - *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_128bit, s2_128bit; \ - \ - /* 256 => 128 bits */ \ - s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \ - _mm256_extracti128_si256((v_s1), 1)); \ - s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \ - _mm256_extracti128_si256((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit, \ - (used_sad)); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m256i /* __v8su */ s1_256bit, s2_256bit; \ - \ - /* 512 => 256 bits */ \ - s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \ - _mm512_extracti64x4_epi64((v_s1), 1)); \ - s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \ - _mm512_extracti64x4_epi64((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit, \ - (used_sad)); \ -} - -/* - * This is a very silly partial workaround for gcc bug - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to - * generate extra move instructions in some loops containing vector intrinsics. - * - * An alternate workaround would be to use gcc native vector operations instead - * of vector intrinsics. But that would result in MSVC needing its own code. - */ -#if GCC_PREREQ(1, 0) -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f)) -#else -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - (void)a, (void)b, (void)c, (void)d, (void)e, (void)f -#endif - -/* - * SSE2 and AVX2 implementations. They are very similar; the AVX2 - * implementation just uses twice the vector width as the SSE2 one. - */ +/* SSE2 and AVX2 implementations. Used on older CPUs. 
*/ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define adler32_sse2 adler32_sse2 -# define FUNCNAME adler32_sse2 -# define FUNCNAME_CHUNK adler32_sse2_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 32 -/* - * The 16-bit precision byte counters must not be allowed to undergo *signed* - * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16) - * would behave incorrectly. - */ -# define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("sse2") -static forceinline ATTRIBUTES void -adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) -{ - static const u16 _aligned_attribute(16) mults[4][8] = { - { 32, 31, 30, 29, 28, 27, 26, 25 }, - { 24, 23, 22, 21, 20, 19, 18, 17 }, - { 16, 15, 14, 13, 12, 11, 10, 9 }, - { 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]); - const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]); - const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]); - const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]); - const __m128i zeroes = _mm_setzero_si128(); - - /* s1 counters: 32-bit, sum of bytes */ - __m128i /* __v4su */ v_s1 = zeroes; - - /* s2 counters: 32-bit, sum of s1 values */ - __m128i /* __v4su */ v_s2 = zeroes; - - /* - * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes - * that eventually need to be multiplied by a number 32...1 for addition - * into s2. - */ - __m128i /* __v8hu */ v_byte_sums_a = zeroes; - __m128i /* __v8hu */ v_byte_sums_b = zeroes; - __m128i /* __v8hu */ v_byte_sums_c = zeroes; - __m128i /* __v8hu */ v_byte_sums_d = zeroes; - - do { - /* Load the next 32 bytes. */ - const __m128i bytes1 = *p++; - const __m128i bytes2 = *p++; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * Logically, this really should be v_s2 += v_s1 * 32, but we - * can do the multiplication (or left shift) later. - */ - v_s2 = _mm_add_epi32(v_s2, v_s1); - - /* - * s1 update: use "Packed Sum of Absolute Differences" to add - * the bytes horizontally with 8 bytes per sum. Then add the - * sums to the s1 counters. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes)); - - /* - * Also accumulate the bytes into 32 separate counters that have - * 16-bit precision. - */ - v_byte_sums_a = _mm_add_epi16( - v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm_add_epi16( - v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm_add_epi16( - v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm_add_epi16( - v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - /* Finish calculating the s2 counters. */ - v_s2 = _mm_slli_epi32(v_s2, 5); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d)); - - /* Add the counters to the real s1 and s2. 
*/ - ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" - -# define adler32_avx2 adler32_avx2 -# define FUNCNAME adler32_avx2 -# define FUNCNAME_CHUNK adler32_avx2_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("avx2") -static forceinline ATTRIBUTES void -adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) -{ - /* - * Note, the multipliers have to be in this order because - * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately. - */ - static const u16 _aligned_attribute(32) mults[4][16] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, - { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, - { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, - { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]); - const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1 = zeroes; - __m256i /* __v8su */ v_s2 = zeroes; - __m256i /* __v16hu */ v_byte_sums_a = zeroes; - __m256i /* __v16hu */ v_byte_sums_b = zeroes; - __m256i /* __v16hu */ v_byte_sums_c = zeroes; - __m256i /* __v16hu */ v_byte_sums_d = zeroes; - - do { - const __m256i bytes1 = *p++; - const __m256i bytes2 = *p++; - - v_s2 = _mm256_add_epi32(v_s2, v_s1); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes)); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes)); - v_byte_sums_a = _mm256_add_epi16( - v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm256_add_epi16( - v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm256_add_epi16( - v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm256_add_epi16( - v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - v_s2 = _mm256_slli_epi32(v_s2, 6); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d)); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" +# define adler32_x86_sse2 adler32_x86_sse2 +# define SUFFIX _x86_sse2 +# define ATTRIBUTES _target_attribute("sse2") +# define VL 16 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" + +# define adler32_x86_avx2 adler32_x86_avx2 +# define SUFFIX _x86_avx2 +# define ATTRIBUTES _target_attribute("avx2") +# define VL 32 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" #endif /* - * AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI - * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. - * AVX-VNNI adds dot product instructions to CPUs without AVX-512. + * AVX-VNNI implementation. 
This is used on CPUs that have AVX2 and AVX-VNNI + * but don't have AVX-512, for example Intel Alder Lake. */ #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) -# define adler32_avx2_vnni adler32_avx2_vnni -# define FUNCNAME adler32_avx2_vnni -# define FUNCNAME_CHUNK adler32_avx2_vnni_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni +# define SUFFIX _x86_avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") -static forceinline ATTRIBUTES void -adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(32) mults[2][32] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, }, - { 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, }, - }; - const __m256i /* __v32qu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v32qu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1_a = zeroes; - __m256i /* __v8su */ v_s1_b = zeroes; - __m256i /* __v8su */ v_s1_sums_a = zeroes; - __m256i /* __v8su */ v_s1_sums_b = zeroes; - __m256i /* __v8su */ v_s2_a = zeroes; - __m256i /* __v8su */ v_s2_b = zeroes; - __m256i /* __v8su */ v_s2_c = zeroes; - __m256i /* __v8su */ v_s2_d = zeroes; - - do { - const __m256i bytes_a = *p++; - const __m256i bytes_b = *p++; - const __m256i bytes_c = *p++; - const __m256i bytes_d = *p++; - - v_s2_a = _mm256_dpbusd_avx_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm256_dpbusd_avx_epi32(v_s2_b, bytes_b, mults_b); - v_s2_c = _mm256_dpbusd_avx_epi32(v_s2_c, bytes_c, mults_a); - v_s2_d = _mm256_dpbusd_avx_epi32(v_s2_d, bytes_d, mults_b); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm256_add_epi32(v_s1_sums_b, v_s1_b); - v_s1_a = _mm256_add_epi32(v_s1_a, - _mm256_add_epi32(_mm256_sad_epu8(bytes_a, zeroes), - _mm256_sad_epu8(bytes_b, zeroes))); - v_s1_b = _mm256_add_epi32(v_s1_b, - _mm256_add_epi32(_mm256_sad_epu8(bytes_c, zeroes), - _mm256_sad_epu8(bytes_d, zeroes))); -#if GCC_PREREQ(1, 0) - __asm__("" : "+x" (v_s1_a), "+x" (v_s1_b), "+x" (v_s1_sums_a), - "+x" (v_s1_sums_b), "+x" (v_s2_a), "+x" (v_s2_b), - "+x" (v_s2_c), "+x" (v_s2_d)); +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 0 +# include "adler32_template.h" #endif - } while (p != end); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm256_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, _mm256_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm256_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_b); - v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1); -} -# include "../adler32_vec_template.h" -#endif +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * AVX512VNNI implementation using 256-bit vectors. This is very similar to the + * AVX-VNNI implementation but takes advantage of masking and more registers. + * This is used on CPUs that support AVX-512 but where using 512-bit vectors + * causes downclocking. 
This should also be the optimal implementation for CPUs + * that support AVX10/256 but not AVX10/512. + */ +# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni +# define SUFFIX _x86_avx512_vl256_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" /* - * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) - * instruction from AVX512VNNI. + * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that + * have a good AVX-512 implementation including AVX512VNNI. This should also be + * the optimal implementation for CPUs that support AVX10/512. */ -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) -# define adler32_avx512_vnni adler32_avx512_vnni -# define FUNCNAME adler32_avx512_vnni -# define FUNCNAME_CHUNK adler32_avx512_vnni_chunk -# define IMPL_ALIGNMENT 64 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni +# define SUFFIX _x86_avx512_vl512_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") -static forceinline ATTRIBUTES void -adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(64) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults); - const __m512i /* __v64qu */ ones = _mm512_set1_epi8(1); - const __m512i zeroes = _mm512_setzero_si512(); - __m512i /* __v16su */ v_s1_a = zeroes; - __m512i /* __v16su */ v_s1_b = zeroes; - __m512i /* __v16su */ v_s1_sums_a = zeroes; - __m512i /* __v16su */ v_s1_sums_b = zeroes; - __m512i /* __v16su */ v_s2_a = zeroes; - __m512i /* __v16su */ v_s2_b = zeroes; - - do { - const __m512i bytes_a = *p++; - const __m512i bytes_b = *p++; - - v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b); - /* - * vpdpbusd with 1's seems to be faster than vpsadbw + vpaddd - * here, provided that the accumulators are independent. 
- */ - v_s1_a = _mm512_dpbusd_epi32(v_s1_a, bytes_a, ones); - v_s1_b = _mm512_dpbusd_epi32(v_s1_b, bytes_b, ones); - GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b, - v_s2_a, v_s2_b); - } while (p != end); - - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, _mm512_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0); -} -# include "../adler32_vec_template.h" +# define VL 64 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" #endif static inline adler32_func_t @@ -402,22 +98,27 @@ arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef adler32_avx512_vnni +#ifdef adler32_x86_avx512_vl512_vnni if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) - return adler32_avx512_vnni; + return adler32_x86_avx512_vl512_vnni; +#endif +#ifdef adler32_x86_avx512_vl256_vnni + if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && + HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl256_vnni; #endif -#ifdef adler32_avx2_vnni +#ifdef adler32_x86_avx2_vnni if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) - return adler32_avx2_vnni; + return adler32_x86_avx2_vnni; #endif -#ifdef adler32_avx2 +#ifdef adler32_x86_avx2 if (HAVE_AVX2(features)) - return adler32_avx2; + return adler32_x86_avx2; #endif -#ifdef adler32_sse2 +#ifdef adler32_x86_sse2 if (HAVE_SSE2(features)) - return adler32_sse2; + return adler32_x86_sse2; #endif return NULL; } diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h new file mode 100644 index 00000000..9a3146b8 --- /dev/null +++ b/lib/x86/adler32_template.h @@ -0,0 +1,515 @@ +/* + * x86/adler32_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating Adler-32 functions for x86. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. 
Must satisfy the dependencies of the
+ * other parameters as follows:
+ * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
+ * (Other combinations are not useful and have not been tested.)
+ * VL:
+ * Vector length in bytes. Must be 16, 32, or 64.
+ * USE_VNNI:
+ * If 1, use the VNNI dot product based algorithm.
+ * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
+ * USE_MASKING:
+ * If 1, use AVX-512 features such as masking.
+ * If 0, assume that the CPU might not support AVX-512.
+ */
+
+#if VL == 16
+# define vec_t __m128i
+# define mask_t u16
+# define LOG2_VL 4
+# define VADD8(a, b) _mm_add_epi8((a), (b))
+# define VADD16(a, b) _mm_add_epi16((a), (b))
+# define VADD32(a, b) _mm_add_epi32((a), (b))
+# if USE_MASKING
+# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
+# else
+# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
+# endif
+# define VLOAD(p) _mm_load_si128((const void *)(p))
+# define VLOADU(p) _mm_loadu_si128((const void *)(p))
+# define VMADD16(a, b) _mm_madd_epi16((a), (b))
+# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
+# define VSAD8(a, b) _mm_sad_epu8((a), (b))
+# define VSET1_32(a) _mm_set1_epi32(a)
+# define VSET1_8(a) _mm_set1_epi8(a)
+# define VSETZERO() _mm_setzero_si128()
+# define VSLL32(a, b) _mm_slli_epi32((a), (b))
+# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
+# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+#elif VL == 32
+# define vec_t __m256i
+# define mask_t u32
+# define LOG2_VL 5
+# define VADD8(a, b) _mm256_add_epi8((a), (b))
+# define VADD16(a, b) _mm256_add_epi16((a), (b))
+# define VADD32(a, b) _mm256_add_epi32((a), (b))
+# if USE_MASKING
+# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
+# else
+# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
+# endif
+# define VLOAD(p) _mm256_load_si256((const void *)(p))
+# define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+# define VMADD16(a, b) _mm256_madd_epi16((a), (b))
+# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
+# define VSAD8(a, b) _mm256_sad_epu8((a), (b))
+# define VSET1_32(a) _mm256_set1_epi32(a)
+# define VSET1_8(a) _mm256_set1_epi8(a)
+# define VSETZERO() _mm256_setzero_si256()
+# define VSLL32(a, b) _mm256_slli_epi32((a), (b))
+# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
+# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+#elif VL == 64
+# define vec_t __m512i
+# define mask_t u64
+# define LOG2_VL 6
+# define VADD8(a, b) _mm512_add_epi8((a), (b))
+# define VADD32(a, b) _mm512_add_epi32((a), (b))
+# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
+# define VLOAD(p) _mm512_load_si512((const void *)(p))
+# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
+# define VSET1_32(a) _mm512_set1_epi32(a)
+# define VSET1_8(a) _mm512_set1_epi8(a)
+# define VSETZERO() _mm512_setzero_si512()
+# define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+#else
+# error "unsupported vector length"
+#endif
+
+#define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c))
+#define
VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) +#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) +#define VADD32_7X(a, b, c, d, e, f, g) \ + VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) + +/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */ +#undef reduce_to_32bits +static forceinline ATTRIBUTES void +ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) +{ + __m128i v_s1_128, v_s2_128; +#if VL == 16 + { + v_s1_128 = v_s1; + v_s2_128 = v_s2; + } +#else + { + __m256i v_s1_256, v_s2_256; +# if VL == 32 + v_s1_256 = v_s1; + v_s2_256 = v_s2; +# else + /* Reduce 512 bits to 256 bits. */ + v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), + _mm512_extracti64x4_epi64(v_s1, 1)); + v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), + _mm512_extracti64x4_epi64(v_s2, 1)); +# endif + /* Reduce 256 bits to 128 bits. */ + v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), + _mm256_extracti128_si256(v_s1_256, 1)); + v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), + _mm256_extracti128_si256(v_s2_256, 1)); + } +#endif + + /* + * Reduce 128 bits to 32 bits. + * + * If the bytes were summed into v_s1 using psadbw + paddd, then ignore + * the odd-indexed elements of v_s1_128 since they are zero. + */ +#if USE_VNNI + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); +#endif + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); + + *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); + *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); +} +#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) + +static u32 ATTRIBUTES +ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +{ +#if USE_VNNI + /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ + static const u8 _aligned_attribute(VL) raw_mults[VL] = { + #if VL == 64 + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + #endif + #if VL >= 32 + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + #endif + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const vec_t ones = VSET1_8(1); +#else + /* + * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. + * For VL==32 the ordering is weird because it has to match the way that + * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. + */ + static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { + #if VL == 16 + { 32, 31, 30, 29, 28, 27, 26, 25 }, + { 24, 23, 22, 21, 20, 19, 18, 17 }, + { 16, 15, 14, 13, 12, 11, 10, 9 }, + { 8, 7, 6, 5, 4, 3, 2, 1 }, + #elif VL == 32 + { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, + { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, + { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, + { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, + #else + # error "unsupported parameters" + #endif + }; + const vec_t mults_a = VLOAD(raw_mults[0]); + const vec_t mults_b = VLOAD(raw_mults[1]); + const vec_t mults_c = VLOAD(raw_mults[2]); + const vec_t mults_d = VLOAD(raw_mults[3]); +#endif + const vec_t zeroes = VSETZERO(); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. 
+ * For smaller lengths, just take the unaligned load penalty. + */ + if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & (VL-1)); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + +#if USE_VNNI + /* + * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by + * the signed bytes of another vector and adds the sums in groups of 4 + * to the 32-bit elements of a third vector. We use it in two ways: + * multiplying the data bytes by a sequence like 64,63,62,...,1 for + * calculating part of s2, and multiplying the data bytes by an all-ones + * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones + * trick seems to be faster than the alternative of vpsadbw + vpaddd. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); + vec_t mults = VLOAD(raw_mults); + vec_t v_s1 = zeroes; + vec_t v_s2 = zeroes; + + s2 += s1 * n; + len -= n; + + if (n >= 4 * VL) { + vec_t v_s1_b = zeroes; + vec_t v_s1_c = zeroes; + vec_t v_s1_d = zeroes; + vec_t v_s2_b = zeroes; + vec_t v_s2_c = zeroes; + vec_t v_s2_d = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_s1_sums_b = zeroes; + vec_t v_s1_sums_c = zeroes; + vec_t v_s1_sums_d = zeroes; + vec_t tmp0, tmp1; + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + vec_t data_c = VLOADU(p + 2*VL); + vec_t data_d = VLOADU(p + 3*VL); + + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+v" (data_a), "+v" (data_b), + "+v" (data_c), "+v" (data_d)); + #endif + + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2_b = VDPBUSD(v_s2_b, data_b, mults); + v_s2_c = VDPBUSD(v_s2_c, data_c, mults); + v_s2_d = VDPBUSD(v_s2_d, data_d, mults); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); + v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); + v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); + + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1_b = VDPBUSD(v_s1_b, data_b, ones); + v_s1_c = VDPBUSD(v_s1_c, data_c, ones); + v_s1_d = VDPBUSD(v_s1_d, data_d, ones); + + /* Same gcc bug workaround. See above */ + #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) + __asm__("" : "+v" (v_s2), "+v" (v_s2_b), + "+v" (v_s2_c), "+v" (v_s2_d), + "+v" (v_s1_sums), + "+v" (v_s1_sums_b), + "+v" (v_s1_sums_c), + "+v" (v_s1_sums_d), + "+v" (v_s1), "+v" (v_s1_b), + "+v" (v_s1_c), "+v" (v_s1_d)); + #endif + p += 4*VL; + n -= 4*VL; + } while (n >= 4*VL); + + /* + * Reduce into v_s1 and v_s2 as follows: + * + * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + + * (4*VL)*(v_s1_sums + v_s1_sums_b + + * v_s1_sums_c + v_s1_sums_d) + + * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c + * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d + */ + tmp0 = VADD32(v_s1, v_s1_b); + tmp1 = VADD32(v_s1, v_s1_c); + v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, + v_s1_sums_c, v_s1_sums_d); + v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); + v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), + VSLL32(tmp0, LOG2_VL + 1), + VSLL32(tmp1, LOG2_VL), + v_s2, v_s2_b, v_s2_c, v_s2_d); + } + + /* Process the last 0 <= n < 4*VL bytes of the chunk. 
*/
+ if (n >= 2*VL) {
+ const vec_t data_a = VLOADU(p + 0*VL);
+ const vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1 = VDPBUSD(v_s1, data_b, ones);
+ v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2 = VDPBUSD(v_s2, data_b, mults);
+ p += 2*VL;
+ n -= 2*VL;
+ }
+ if (n) {
+ /* Process the last 0 < n < 2*VL bytes of the chunk. */
+ vec_t data;
+
+ v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
+
+ mults = VADD8(mults, VSET1_8((int)n - VL));
+ if (n > VL) {
+ data = VLOADU(p);
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += VL;
+ n -= VL;
+ mults = VADD8(mults, VSET1_8(-VL));
+ }
+ /*
+ * Process the last 0 < n <= VL bytes of the chunk.
+ * Utilize a masked load if it's available.
+ */
+ #if USE_MASKING
+ data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
+ #else
+ data = zeroes;
+ memcpy(&data, p, n);
+ #endif
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += n;
+ }
+
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+#else /* USE_VNNI */
+ /*
+ * This is Adler-32 for SSE2 and AVX2.
+ *
+ * To horizontally sum bytes, use psadbw + paddd, where one of the
+ * arguments to psadbw is all-zeroes.
+ *
+ * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
+ * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
+ * to sum, for each i across iterations, byte i into a corresponding
+ * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd to
+ * multiply each counter i by (2*VL - i), then add the products to s2.
+ *
+ * An alternative implementation would use pmaddubsw and pmaddwd in the
+ * inner loop to do (2*VL-i)*data[i] directly and add the products in
+ * groups of 4 to 32-bit counters. However, on average that approach
+ * seems to be slower than the current approach which delays the
+ * multiplications. Also, pmaddubsw requires SSSE3; the current
+ * approach keeps the implementation aligned between SSE2 and AVX2.
+ *
+ * The inner loop processes 2*VL bytes per iteration. Increasing this
+ * to 4*VL doesn't seem to be helpful here.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX, and every
+ * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
+ * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
+ * used with pmaddwd which does signed multiplication. In the
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
+ */ + size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), + MAX_CHUNK_LEN) & ~(2*VL - 1)); + len -= n; + + if (n >= 2*VL) { + vec_t v_s1 = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_byte_sums_a = zeroes; + vec_t v_byte_sums_b = zeroes; + vec_t v_byte_sums_c = zeroes; + vec_t v_byte_sums_d = zeroes; + vec_t v_s2; + + s2 += s1 * (n & ~(2*VL - 1)); + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_byte_sums_a = VADD16(v_byte_sums_a, + VUNPACKLO8(data_a, zeroes)); + v_byte_sums_b = VADD16(v_byte_sums_b, + VUNPACKHI8(data_a, zeroes)); + v_byte_sums_c = VADD16(v_byte_sums_c, + VUNPACKLO8(data_b, zeroes)); + v_byte_sums_d = VADD16(v_byte_sums_d, + VUNPACKHI8(data_b, zeroes)); + v_s1 = VADD32(v_s1, + VADD32(VSAD8(data_a, zeroes), + VSAD8(data_b, zeroes))); + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), + "+x" (v_byte_sums_a), + "+x" (v_byte_sums_b), + "+x" (v_byte_sums_c), + "+x" (v_byte_sums_d)); + #endif + p += 2*VL; + n -= 2*VL; + } while (n >= 2*VL); + + /* + * Calculate v_s2 as (2*VL)*v_s1_sums + + * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. + * Then update s1 and s2 from v_s1 and v_s2. + */ + v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), + VMADD16(v_byte_sums_a, mults_a), + VMADD16(v_byte_sums_b, mults_b), + VMADD16(v_byte_sums_c, mults_c), + VMADD16(v_byte_sums_d, mults_d)); + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + } + /* + * Process the last 0 <= n < 2*VL bytes of the chunk using + * scalar instructions, then reduce s1 and s2 mod DIVISOR. + */ + adler32_generic_noreduce(&s1, &s2, p, n); + p += n; + s1 %= DIVISOR; + s2 %= DIVISOR; + } +#endif /* !USE_VNNI */ + return (s2 << 16) | s1; +} + +#undef vec_t +#undef mask_t +#undef LOG2_VL +#undef VADD8 +#undef VADD16 +#undef VADD32 +#undef VDPBUSD +#undef VLOAD +#undef VLOADU +#undef VMADD16 +#undef VMASKZ_LOADU +#undef VMULLO32 +#undef VSAD8 +#undef VSET1_8 +#undef VSET1_32 +#undef VSETZERO +#undef VSLL32 +#undef VUNPACKHI8 +#undef VUNPACKLO8 + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef USE_VNNI +#undef USE_MASKING diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 5582fcd4..c6eae775 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -85,6 +85,9 @@ static inline u32 get_x86_cpu_features(void) # if __has_include() # include # endif +# if __has_include() +# include +# endif # if __has_include() # include # endif diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 03707752..12fb405f 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -198,12 +198,16 @@ echo case $ARCH in i386|x86_64) if have_cpu_features avx512bw avx512_vnni; then - do_benchmark "AVX512BW/AVX512VNNI" + do_benchmark "AVX512VNNI/VL512" + disable_cpu_feature zmm + if have_cpu_features avx512vl; then + do_benchmark "AVX512VNNI/VL256" + fi disable_cpu_feature avx512_vnni "-mno-avx512vnni" disable_cpu_feature avx512bw "-mno-avx512bw" fi if have_cpu_features avx2 avx_vnni; then - do_benchmark "AVX2/AVX-VNNI" + do_benchmark "AVX-VNNI" disable_cpu_feature avx_vnni "-mno-avxvnni" fi if have_cpu_features avx2; then diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 51a7f4b0..4419bc06 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,8 +142,7 @@ build_and_run_tests() if ! 
[[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then case "$ARCH" in i386|x86_64) - features+=(zmm avx_vnni avx512_vnni vpclmulqdq - avx512vl avx512bw avx512f + features+=(zmm avx512_vnni avx512vl avx_vnni vpclmulqdq avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*)